Add an option not to abort on cuda OOM #1110

Open · wants to merge 1 commit into base: master
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
build/
release/
build-*/
out/
tmp/
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -87,6 +87,7 @@ option(GGML_GPROF "ggml: enable gprof"

# build
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
option(GGML_NO_ABORT_ON_OOM "ggml: enable no abort on OOM (experimental)" OFF)

# sanitizers
option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
2 changes: 1 addition & 1 deletion include/ggml-alloc.h
@@ -19,7 +19,7 @@ struct ggml_tallocr {
};

GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
GGML_API enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
Member:
I don't think it is necessary to change this function, since it does not allocate any memory itself. All errors from this function can be prevented by ensuring that the buffer has enough space.

Contributor Author:
ok, reverted
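
(Editor's illustration of the reviewer's point above, not part of the diff: using only accessors that already appear in ggml-alloc.c, a caller can rule out the error case before calling ggml_tallocr_alloc, so the allocator itself never needs to report OOM.)

size_t need = ggml_backend_buffer_get_alloc_size(talloc.buffer, tensor);
need = GGML_PAD(need, talloc.alignment);
if (talloc.offset + need <= ggml_backend_buffer_get_size(talloc.buffer)) {
    ggml_tallocr_alloc(&talloc, tensor);  // cannot run out of space if the check above holds
} else {
    // handle the shortfall here (e.g. pick another buffer) instead of relying on the allocator
}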


// Graph allocator
/*
4 changes: 4 additions & 0 deletions src/CMakeLists.txt
@@ -60,6 +60,10 @@ if (GGML_FATAL_WARNINGS)
endif()
endif()

if (GGML_NO_ABORT_ON_OOM)
add_compile_definitions(GGML_NO_ABORT_ON_OOM)
endif()

if (GGML_ALL_WARNINGS)
if (NOT MSVC)
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
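(Editor's usage note: with the option wired up as above, a build that wants the non-aborting behavior would presumably be configured with something like cmake -B build -DGGML_NO_ABORT_ON_OOM=ON, so that add_compile_definitions() defines GGML_NO_ABORT_ON_OOM for every ggml translation unit; left OFF, the existing abort-on-OOM behavior is unchanged.)
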
20 changes: 17 additions & 3 deletions src/ggml-alloc.c
@@ -89,14 +89,18 @@ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
return talloc;
}

void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
size = GGML_PAD(size, talloc->alignment);

if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
#ifdef GGML_NO_ABORT_ON_OOM
return GGML_STATUS_ALLOC_FAILED;
#else
GGML_ABORT("not enough space in the buffer");
#endif
}

void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -105,6 +109,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
assert(((uintptr_t)addr % talloc->alignment) == 0);

ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
return GGML_STATUS_SUCCESS;
}

// dynamic tensor allocator
@@ -150,6 +155,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
}
#endif

// Check with reviewer: could that function return a ggml_status (with the offset as an out argument)?
Member:
This function also does not allocate any (physical) memory, it is just calculating offsets within a buffer. If it fails, it means there is a bug somewhere else.

Contributor Author:
ok, but note that it can still abort.

Member:
The abort is mostly a sanity check; it cannot happen if everything is working as expected. If it triggers, it means there is a serious bug in ggml.
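
(Editor's sketch of the alternative the in-code question above is asking about; hypothetical and, per the reviewer, probably unnecessary: the offset would be passed back through an out parameter so the return value can carry a ggml_status.)

static enum ggml_status ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size,
                                               const struct ggml_tensor * tensor, size_t * offset);

// call site:
size_t offset;
enum ggml_status s = ggml_dyn_tallocr_alloc(alloc, size, node, &offset);
if (s != GGML_STATUS_SUCCESS) {
    // propagate the failure instead of GGML_ABORT
}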

static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);

@@ -179,6 +185,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
// this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
// Note: no way to honor GGML_NO_ABORT_ON_OOM since that fn returns the offset, not a ggml_status
GGML_ABORT("not enough space in the buffer");
}
}
@@ -378,6 +385,7 @@ struct ggml_gallocr {
};

ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
//GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
GGML_ASSERT(galloc != NULL);

@@ -670,7 +678,10 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}
}

// Returns true on success, false otherwise
// Check with reviewers: any cons to returning a ggml_status?
Comment on lines +681 to +682
Member:
It would be ok to change the gallocr functions to return a ggml_status.

Contributor Author:
ok, retouching that PR.
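
(Editor's sketch of the direction agreed above; the exact signatures are assumptions and not yet part of this diff: the gallocr entry points would return a ggml_status instead of a bool, and callers check it.)

GGML_API enum ggml_status ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph,
                                                 const int * node_buffer_ids, const int * leaf_buffer_ids);
GGML_API enum ggml_status ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

// caller side:
enum ggml_status s = ggml_gallocr_alloc_graph(galloc, graph);
if (s != GGML_STATUS_SUCCESS) {
    // e.g. free other resources and retry, or report the error upward instead of aborting
}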

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
//GGML_LOG_TRACE("ggml_gallocr_reserve_n\n");
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
@@ -865,6 +876,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
return false;
}

// Check with reviewers: any cons to returning a ggml_status here?
bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
@@ -954,7 +966,9 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(&tallocr, t);
enum ggml_status s = ggml_tallocr_alloc(&tallocr, t);
if (s != GGML_STATUS_SUCCESS)
GGML_LOG_WARN("%s: failed to alloc tensor %s \n", __func__, t->name);
} else if (t->buffer == NULL) {
ggml_backend_view_init(t);
}
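(Editor's note on the hunk above: alloc_tensor_range already returns a bool, so the failure could be propagated rather than only logged. A minimal sketch, assuming the surrounding failure paths of that function clean up and return false as elsewhere in ggml-alloc.c:)

enum ggml_status s = ggml_tallocr_alloc(&tallocr, t);
if (s != GGML_STATUS_SUCCESS) {
    GGML_LOG_WARN("%s: failed to alloc tensor %s\n", __func__, t->name);
    return false;  // surface the failure to the caller instead of continuing with an unallocated tensor
}
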
2 changes: 1 addition & 1 deletion src/ggml-backend-impl.h
@@ -44,7 +44,7 @@ extern "C" {
// base address of the buffer
void * (*get_base) (ggml_backend_buffer_t buffer);
// (optional) initialize a tensor in the buffer (eg. add tensor extras)
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
enum ggml_status (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
Member:
All backends that use this function will need to be updated. It would be preferable to open the PR in llama.cpp since it has much better CI.

Contributor Author:
ok. To move forward step by step, would you accept a PR in llama.cpp with just that init_tensor change?

Member:
Yes.
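
(Editor's sketch of what the interface change implies for the generic dispatcher; the wrapper below is hypothetical and its exact shape in ggml-backend.c is an assumption: once the iface member returns a status, the wrapper can forward it instead of returning void, and each backend's init_tensor gains a return GGML_STATUS_SUCCESS, as the amx and aarch64 hunks below show.)

static enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.init_tensor != NULL) {
        return buffer->iface.init_tensor(buffer, tensor);
    }
    return GGML_STATUS_SUCCESS;  // nothing to do for backends without an init_tensor hook
}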

// tensor data access
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
3 changes: 2 additions & 1 deletion src/ggml-cpu/amx/amx.cpp
@@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *) (buffer->context);
}

static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
static ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
3 changes: 2 additions & 1 deletion src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -4135,10 +4135,11 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con
return nullptr;
}

static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
static ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));

GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
}

static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
16 changes: 9 additions & 7 deletions src/ggml-cuda/common.cuh
@@ -79,18 +79,19 @@

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
// Print the error; also abort if the abort flag is true
Member:
I am not sure that the abort parameter is necessary. The cuBLAS functions may also allocate memory and fail (CUBLAS_CHECK).

Contributor Author:
I would propose to keep the abort bool option, since it's up to the developer to decide whether to allow the abort or not.
For cuBLAS, I could add a CUBLAS_CHECK_NO_ABORT() if you'd like me to.

Member:
As I mentioned below, I do not think there is any case where aborting on a CUDA call failure is acceptable. We must allow applications to deal with these errors; we can't just make the user's application disappear without explanation when something unexpected happens.

Contributor Author:
Hum, at least we agree that the abort is/was too brutal.
I introduced the abort bool to distinguish between CUDA failures, which abort today, and OOM failures, which also abort today but for which we don't want that.
At the moment our goal is just to catch OOMs, not to handle and forward upward every CUDA failure (OOM or not).
So you propose to extend the scope of this PR to all CUDA failures, right?

Member (@slaren, Feb 12, 2025):

It's not necessary to extend the scope of the PR; you can leave the aborts on functions that don't have a way to return an error, like the buffer functions. However, you will still need to catch the exceptions and turn them into a GGML_ABORT. In the future we can extend the ggml API to return errors in more conditions. Adding an abort parameter is just going to add a lot of changes that will need to be reverted in the future anyway.

Contributor Author:
hum, so something like

try {
    CUDA_CHECK(dosomething());
} catch (const std::exception & e) { GGML_ABORT("%s", e.what()); }

would be a nightmare, as there are hundreds of CUDA_CHECK calls in ggml-cuda.cu.

Wouldn't it be simpler to add the throw in CUDA_CHECK_GEN

#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
    do {                                                                            \
        auto err_ = (err);                                                          \
        if (err_ != (success)) {                                                    \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
            throw (err_ == oom ? std::bad_alloc(...) : std::runtime_error(...));    \
        }                                                                           \
    } while (0)

?

Member:
You don't need a try..catch block for every CUDA_CHECK, only one for each ggml-backend interface function. For example:

static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) try {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
catch (const std::exception & e) {
    GGML_ABORT("%s", e.what());
}

Member:
If it was an easy refactor, we would have already done it. If you add an abort parameter to every CUDA_CHECK, you will be adding to the work that will need to be done in the future.

void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg, bool abort);

#define CUDA_CHECK_GEN(err, success, error_fn) \
#define CUDA_CHECK_GEN(err, success, error_fn, abort) \
do { \
auto err_ = (err); \
if (err_ != (success)) { \
ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_), abort); \
} \
} while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, true)
#define CUDA_CHECK_NO_ABORT(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, false)

#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
static const char * cublas_get_error_str(const cublasStatus_t err) {
@@ -113,15 +114,16 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
}
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str, true)

#if !defined(GGML_USE_HIP)
static const char * cu_get_error_str(CUresult err) {
const char * err_str;
cuGetErrorString(err, &err_str);
return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
// Will print the error and abort
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str, true)
#endif

#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
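
(Editor's sketch of the non-aborting allocation pattern being discussed; hypothetical helper, not part of this diff: the CUDA error is checked directly, the sticky error state is cleared, and the failure is returned as a value the caller can act on instead of aborting the process.)

static void * cuda_malloc_or_null(size_t size) {
    void * dev_ptr = nullptr;
    cudaError_t err = cudaMalloc(&dev_ptr, size);
    if (err != cudaSuccess) {
        (void) cudaGetLastError();  // clear the error so later CUDA calls are not misreported
        GGML_LOG_ERROR("%s: failed to allocate %zu bytes: %s\n", __func__, size, cudaGetErrorString(err));
        return nullptr;  // caller decides: free caches and retry, fall back to host memory, or report upward
    }
    return dev_ptr;
}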