Add an option not to abort on CUDA OOM #1110
base: master
```diff
@@ -1,4 +1,5 @@
 build/
 release/
+build-*/
 out/
 tmp/
```
```diff
@@ -89,14 +89,18 @@ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     return talloc;
 }
 
-void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
+enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size = GGML_PAD(size, talloc->alignment);
 
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
+#ifdef GGML_NO_ABORT_ON_OOM
+        return GGML_STATUS_ALLOC_FAILED;
+#else
         GGML_ABORT("not enough space in the buffer");
+#endif
     }
 
     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
```
```diff
@@ -105,6 +109,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     assert(((uintptr_t)addr % talloc->alignment) == 0);
 
     ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
+    return GGML_STATUS_SUCCESS;
 }
 
 // dynamic tensor allocator
```
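With `ggml_tallocr_alloc` now returning a status, a caller built with `GGML_NO_ABORT_ON_OOM` can recover from an out-of-space condition instead of crashing. A minimal sketch of such a caller, assuming `talloc` and `tensor` were set up as in the function above (the recovery strategy is illustrative, not part of the PR):

```cpp
// Assumes ggml was built with GGML_NO_ABORT_ON_OOM defined.
enum ggml_status st = ggml_tallocr_alloc(&talloc, tensor);
if (st == GGML_STATUS_ALLOC_FAILED) {
    // recover instead of aborting: e.g. free other buffers, retry with a smaller
    // context, or surface the error to the application
    fprintf(stderr, "tensor '%s' did not fit in the buffer\n", tensor->name);
}
```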
```diff
@@ -150,6 +155,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
 }
 #endif
 
+// Check with reviewer: could that function return a ggml_status (offset being an arg)?
 static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
```

Reviewer: This function also does not allocate any (physical) memory, it is just calculating offsets within a buffer. If it fails, it means there is a bug somewhere else.

Author: OK, but it can still abort.

Reviewer: The abort is mostly a sanity check; it cannot happen if everything is working as expected. If it fails, it means there is a serious bug in ggml.
```diff
@@ -179,6 +185,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             // this should never happen
             GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
+            // Note: no way to honor GGML_NO_ABORT_ON_OOM since that fn returns the offset, not a ggml_status
             GGML_ABORT("not enough space in the buffer");
         }
     }
```
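The question raised in the in-diff comment above (returning a `ggml_status` and passing the offset through an argument) would amount to something like the following prototype. This is only a sketch of the idea being asked about, not something the PR implements:

```cpp
// Hypothetical variant discussed above: report failure through ggml_status and
// hand the offset back via an out parameter instead of the return value.
static enum ggml_status ggml_dyn_tallocr_alloc_status(struct ggml_dyn_tallocr * alloc, size_t size,
                                                      const struct ggml_tensor * tensor, size_t * offset);
```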
```diff
@@ -378,6 +385,7 @@ struct ggml_gallocr {
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+    //GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);
     ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
```
```diff
@@ -670,7 +678,10 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
+// Returns true on success, false otherwise
+// Check with reviewers: any cons to return a ggml_status?
 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    //GGML_LOG_TRACE("ggml_gallocr_reserve_n\n");
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
```

Comment on lines +681 to +682:

Reviewer: It would be ok to change the …

Author: OK, retouching that PR.
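For reference, the change floated in the thread above (switching these allocator entry points from `bool` to `ggml_status`) would look roughly like the prototypes below. This is a sketch of the idea under discussion, not code from the PR:

```cpp
// Hypothetical prototypes if the bool returns were replaced with ggml_status
// (illustrative only).
enum ggml_status ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph,
                                        const int * node_buffer_ids, const int * leaf_buffer_ids);
enum ggml_status ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
```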
```diff
@@ -865,6 +876,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
     return false;
 }
 
+// Check with reviewers: any cons to return a ggml_status here?
 bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
     if (ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
```
```diff
@@ -954,7 +966,9 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
-                ggml_tallocr_alloc(&tallocr, t);
+                enum ggml_status s = ggml_tallocr_alloc(&tallocr, t);
+                if (s != GGML_STATUS_SUCCESS)
+                    GGML_LOG_WARN("%s: failed to alloc tensor %s \n", __func__, t->name);
             } else if (t->buffer == NULL) {
                 ggml_backend_view_init(t);
             }
```
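As written, this hunk only logs a warning and keeps going when an allocation fails. A hedged sketch of a stricter alternative, assuming `alloc_tensor_range` keeps its `bool` return (cleanup of already-allocated buffers omitted for brevity; not what the PR currently does):

```cpp
// Sketch: propagate the failure instead of only warning.
enum ggml_status s = ggml_tallocr_alloc(&tallocr, t);
if (s != GGML_STATUS_SUCCESS) {
    GGML_LOG_WARN("%s: failed to alloc tensor %s\n", __func__, t->name);
    return false; // let the caller of alloc_tensor_range handle the OOM
}
```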
```diff
@@ -44,7 +44,7 @@ extern "C" {
         // base address of the buffer
         void *           (*get_base)     (ggml_backend_buffer_t buffer);
         // (optional) initialize a tensor in the buffer (eg. add tensor extras)
-        void             (*init_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        enum ggml_status (*init_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         // tensor data access
         void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
         void (*set_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
```

Reviewer: All backends that use this function will need to be updated. It would be preferable to open the PR in llama.cpp since it has much better CI.

Author: OK. To move forward step by step, would you accept a PR in llama.cpp with just that init_tensor change?

Reviewer: Yes.
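Since the callback now returns a status, every backend's `init_tensor` implementation has to be adapted. A minimal sketch of what an adapted implementation could look like; the backend name and body below are illustrative, not taken from the PR:

```cpp
// Illustrative only: an init_tensor implementation adapted to the new signature.
static enum ggml_status ggml_backend_example_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                                                struct ggml_tensor * tensor) {
    // backend-specific setup goes here, e.g. allocating per-tensor extras;
    // return GGML_STATUS_ALLOC_FAILED if that setup runs out of memory
    (void) buffer;
    (void) tensor;
    return GGML_STATUS_SUCCESS;
}
```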
```diff
@@ -79,18 +79,19 @@
 
 #define GGML_CUDA_MAX_STREAMS 8
 
-[[noreturn]]
-void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
+// Print the error. Will also abort if abort true
+void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg, bool abort);
 
-#define CUDA_CHECK_GEN(err, success, error_fn) \
+#define CUDA_CHECK_GEN(err, success, error_fn, abort) \
     do { \
         auto err_ = (err); \
        if (err_ != (success)) { \
-            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
+            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_), abort); \
         } \
     } while (0)
 
-#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
+#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, true)
+#define CUDA_CHECK_NO_ABORT(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, false)
 
 #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
 static const char * cublas_get_error_str(const cublasStatus_t err) {
```

Reviewer: I am not sure that the …

Author: I would propose to keep the abort bool option, since it's up to the developer to decide whether to allow the abort or not.

Reviewer: As I mentioned below, I do not think there is any case where aborting on a CUDA call failure is acceptable. We must allow applications to deal with these errors; we can't just make their applications disappear without explanation when something unexpected happens.

Author: Hum, at least we agree that the abort is/was too brutal.

Reviewer: It's not necessary to extend the scope of the PR, you can leave the aborts on functions that don't have a way to return an error, like the buffer functions. However you will still need to catch the exceptions and turn them into a …

Author: Hum, so something like … would be a nightmare, as there are hundreds of CUDA_CHECK calls in ggml-cuda.cu. Wouldn't it be simpler to add the throw in CUDA_CHECK_GEN …?

Reviewer: You don't need a …

```cpp
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) try {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
catch (const std::exception & e) {
    GGML_ABORT("%s", e.what());
}
```

Reviewer: If it was an easy refactor, we would have already done it. If you add an …
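To make the new flag concrete, here is a minimal sketch of how `ggml_cuda_error` could behave with the extra `abort` parameter. This illustrates the mechanism under discussion and is not the PR's actual implementation, which may log more device details:

```cpp
// Illustrative sketch: print the error and only abort when requested.
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line,
                     const char * msg, bool abort) {
    GGML_LOG_ERROR("CUDA error: %s\n", msg);
    GGML_LOG_ERROR("  in function %s at %s:%d\n", func, file, line);
    GGML_LOG_ERROR("  %s\n", stmt);
    if (abort) {
        GGML_ABORT("CUDA error");
    }
    // when abort is false, execution returns to the caller, which is expected to
    // propagate the failure (e.g. as a ggml_status, or as an exception caught at
    // the backend interface boundary as discussed above)
}
```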
```diff
@@ -113,15 +114,16 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
 }
 #endif // CUDART_VERSION >= 12000
 
-#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
+#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str, true)
 
 #if !defined(GGML_USE_HIP)
 static const char * cu_get_error_str(CUresult err) {
     const char * err_str;
     cuGetErrorString(err, &err_str);
     return err_str;
 }
-#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
+// Will print error and abort
+#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str, true)
 #endif
 
 #if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
```
Reviewer: I don't think it is necessary to change this function, since it does not allocate any memory itself. All errors from this function can be prevented by ensuring that the buffer has enough space.

Author: OK, reverted.