diff --git a/.gitignore b/.gitignore index 4bf0fa095..ec4cf7f5f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build/ +release/ build-*/ out/ tmp/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 75b5ea3b4..7c87f867d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,6 +87,7 @@ option(GGML_GPROF "ggml: enable gprof" # build option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF) +option(GGML_NO_ABORT_ON_OOM "ggml: do not abort on OOM (experimental)" OFF) # sanitizers option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF) diff --git a/include/ggml-alloc.h b/include/ggml-alloc.h index 23600eea9..2cb150fd2 100644 --- a/include/ggml-alloc.h +++ b/include/ggml-alloc.h @@ -19,7 +19,7 @@ struct ggml_tallocr { }; GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer); -GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor); +GGML_API enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor); // Graph allocator /* diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0002ac18a..e2be8db93 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,6 +60,10 @@ if (GGML_FATAL_WARNINGS) endif() endif() +if (GGML_NO_ABORT_ON_OOM) + add_compile_definitions(GGML_NO_ABORT_ON_OOM) +endif() + if (GGML_ALL_WARNINGS) if (NOT MSVC) list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 9a3bf9f29..e12b3ecce 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -89,14 +89,18 @@ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) { return talloc; } -void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) { +enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) { size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); size = GGML_PAD(size, talloc->alignment); if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { - GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", + GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n", __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); +#ifdef GGML_NO_ABORT_ON_OOM + return GGML_STATUS_ALLOC_FAILED; +#else GGML_ABORT("not enough space in the buffer"); +#endif } void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; @@ -105,6 +109,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tenso assert(((uintptr_t)addr % talloc->alignment) == 0); ggml_backend_tensor_alloc(talloc->buffer, tensor, addr); + return GGML_STATUS_SUCCESS; } // dynamic tensor allocator @@ -150,6 +155,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs } #endif +// Check with reviewer: could this function return a ggml_status instead (with the offset as an out argument)? static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); @@ -179,6 +185,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz // this should never happen GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", __func__, size, max_avail); + // Note: no way to honor GGML_NO_ABORT_ON_OOM since this function
returns the offset, not a ggml_status GGML_ABORT("not enough space in the buffer"); } } @@ -378,6 +385,7 @@ struct ggml_gallocr { }; ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { + //GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs); ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr)); GGML_ASSERT(galloc != NULL); @@ -670,7 +678,10 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } } +// Returns true on success, false otherwise +// Check with reviewers: any cons to return a ggml_status? bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { + //GGML_LOG_TRACE("ggml_gallocr_reserve_n\n"); size_t min_hash_size = graph->n_nodes + graph->n_leafs; // add 25% margin to avoid hash collisions min_hash_size += min_hash_size / 4; @@ -865,6 +876,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph return false; } +// Check with reviewers: any cons to return a ggml_status here? bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { if (ggml_gallocr_needs_realloc(galloc, graph)) { if (galloc->n_buffers == 1) { @@ -954,7 +966,9 @@ static bool alloc_tensor_range(struct ggml_context * ctx, for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { if (t->data == NULL) { if (t->view_src == NULL) { - ggml_tallocr_alloc(&tallocr, t); + enum ggml_status s = ggml_tallocr_alloc(&tallocr, t); + if (s != GGML_STATUS_SUCCESS) + GGML_LOG_WARN("%s: failed to alloc tensor %s \n", __func__, t->name); } else if (t->buffer == NULL) { ggml_backend_view_init(t); } diff --git a/src/ggml-backend-impl.h b/src/ggml-backend-impl.h index d1c2d76d8..1781c27de 100644 --- a/src/ggml-backend-impl.h +++ b/src/ggml-backend-impl.h @@ -44,7 +44,7 @@ extern "C" { // base address of the buffer void * (*get_base) (ggml_backend_buffer_t buffer); // (optional) initialize a tensor in the buffer (eg. 
add tensor extras) - void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + enum ggml_status (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // tensor data access void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); diff --git a/src/ggml-cpu/amx/amx.cpp b/src/ggml-cpu/amx/amx.cpp index 5ec5263ce..8c8ade202 100644 --- a/src/ggml-cpu/amx/amx.cpp +++ b/src/ggml-cpu/amx/amx.cpp @@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *) (buffer->context); } -static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +static ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor); GGML_UNUSED(buffer); + return GGML_STATUS_SUCCESS; } static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, diff --git a/src/ggml-cpu/ggml-cpu-aarch64.cpp b/src/ggml-cpu/ggml-cpu-aarch64.cpp index b311a5b1c..8d7402fdb 100644 --- a/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -4135,10 +4135,11 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con return nullptr; } -static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +static ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { tensor->extra = (void *) const_cast(ggml_aarch64_get_optimal_repack_type(tensor)); GGML_UNUSED(buffer); + return GGML_STATUS_SUCCESS; } static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, diff --git a/src/ggml-cuda/common.cuh b/src/ggml-cuda/common.cuh index 174916bc9..4685c0b7f 100644 --- a/src/ggml-cuda/common.cuh +++ b/src/ggml-cuda/common.cuh @@ -79,18 +79,19 @@ #define GGML_CUDA_MAX_STREAMS 8 -[[noreturn]] -void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg); +// Print the error. 
Will also abort if abort true +void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg, bool abort); -#define CUDA_CHECK_GEN(err, success, error_fn) \ +#define CUDA_CHECK_GEN(err, success, error_fn, abort) \ do { \ auto err_ = (err); \ if (err_ != (success)) { \ - ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \ + ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_), abort); \ } \ } while (0) -#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) +#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, true) +#define CUDA_CHECK_NO_ABORT(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, false) #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { @@ -113,7 +114,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in } #endif // CUDART_VERSION >= 12000 -#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) +#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str, true) #if !defined(GGML_USE_HIP) static const char * cu_get_error_str(CUresult err) { @@ -121,7 +122,8 @@ static const char * cu_get_error_str(CUresult err) { cuGetErrorString(err, &err_str); return err_str; } -#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) +// Will print error and abort +#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str, true) #endif #if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu index bda10aec1..2fdce8b38 100644 --- a/src/ggml-cuda/ggml-cuda.cu +++ b/src/ggml-cuda/ggml-cuda.cu @@ -60,8 +60,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); -[[noreturn]] -void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { +void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg, bool abort) { int id = -1; // in case cudaGetDevice fails (void)cudaGetDevice(&id); @@ -69,7 +68,8 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_LOG_ERROR(" %s\n", stmt); // abort with GGML_ABORT to get a stack trace - GGML_ABORT(GGML_CUDA_NAME " error"); + if (abort) + GGML_ABORT(GGML_CUDA_NAME " error"); } // this is faster on Windows @@ -356,7 +356,14 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { size_t look_ahead_size = (size_t) (1.05 * size); look_ahead_size = 256 * ((look_ahead_size + 255)/256); ggml_cuda_set_device(device); - CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device)); + cudaError_t status = ggml_cuda_device_malloc(&ptr, look_ahead_size, device); +#ifdef GGML_NO_ABORT_ON_OOM + CUDA_CHECK_NO_ABORT(status); +#else + CUDA_CHECK(status); +#endif + if (!ptr) + return ptr; *actual_size = look_ahead_size; pool_size += look_ahead_size; #ifdef DEBUG_CUDA_MALLOC @@ -533,12 +540,12 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { return ctx->dev_ptr; } -static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_cuda_buffer_context * ctx = 
(ggml_backend_cuda_buffer_context *)buffer->context; if (tensor->view_src != NULL) { assert(tensor->view_src->buffer->buft == buffer->buft); - return; + return GGML_STATUS_SUCCESS; } if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) { @@ -551,6 +558,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); } } + return GGML_STATUS_SUCCESS; } static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -785,7 +793,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff GGML_UNUSED(buffer); } -static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -816,8 +824,17 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf // FIXME: do not crash if cudaMalloc fails // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first ggml_cuda_set_device(id); - char * buf; - CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); + char * buf = NULL; + cudaError_t status = ggml_cuda_device_malloc((void**)&buf, size, id); +#ifdef GGML_NO_ABORT_ON_OOM + CUDA_CHECK_NO_ABORT(status); + if (status != cudaSuccess) + return GGML_STATUS_ALLOC_FAILED; +#else + CUDA_CHECK(status); +#endif + if (!buf) + return GGML_STATUS_ALLOC_FAILED; // set padding to 0 to avoid possible NaN values if (size > original_size) { @@ -831,6 +848,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf } } tensor->extra = extra; + return GGML_STATUS_SUCCESS; } static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -1685,7 +1703,7 @@ static __global__ void k_compute_batched_ptrs( ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } -static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static ggml_status ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); @@ -1707,6 +1725,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co // convert src1 to fp16 ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); +#ifdef GGML_NO_ABORT_ON_OOM + if (!(src1_f16_alloc.ptr)) + return GGML_STATUS_ALLOC_FAILED; +#endif if (src1->type != GGML_TYPE_F16) { const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); const int64_t ne_src1 = ggml_nelements(src1); @@ -1835,9 +1857,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream); } + return GGML_STATUS_SUCCESS; } -static void 
ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static ggml_status ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) @@ -1888,7 +1911,9 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention - ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); + ggml_status st = ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); + if (st != GGML_STATUS_SUCCESS) + return st; } else if (use_mul_mat_vec) { ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr); } else if (use_mul_mat_vec_q) { @@ -1898,6 +1923,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else { ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr); } + return GGML_STATUS_SUCCESS; } struct mmid_row_mapping { @@ -1954,7 +1980,7 @@ static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_origin } } -static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +static ggml_status ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * ids = dst->src[2]; @@ -2014,7 +2040,9 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * src1_row.data = src1_original + i11*nb11 + i12*nb12; dst_row.data = dst_original + i1*nb1 + i2*nb2; - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + ggml_status s = ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + if (s != GGML_STATUS_SUCCESS) + return s; } } } else { @@ -2076,7 +2104,9 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1; - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + ggml_status s = ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + if (s != GGML_STATUS_SUCCESS) + return s; { dim3 block_dims(std::min((unsigned int)ne0, 768u)); @@ -2090,8 +2120,10 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * } } } + return GGML_STATUS_SUCCESS; } +// Check with reviewers: any cons to return a ggml_status instead ? static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { // why is this here instead of mul_mat? 
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) { @@ -2219,11 +2251,13 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); return false; } else { - ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst); + if (ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst) != GGML_STATUS_SUCCESS) + return false; } break; case GGML_OP_MUL_MAT_ID: - ggml_cuda_mul_mat_id(ctx, dst); + if (ggml_cuda_mul_mat_id(ctx, dst) != GGML_STATUS_SUCCESS) + return false; break; case GGML_OP_OUT_PROD: ggml_cuda_out_prod(ctx, dst); @@ -2627,7 +2661,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { } #endif -static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, +// Check with reviewers: shouldn't it return a ggml_status? +static bool evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { @@ -2692,6 +2727,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx graph_evaluated_or_captured = true; #endif // USE_CUDA_GRAPH } + return true; } static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { diff --git a/src/ggml.c b/src/ggml.c index 3b4861542..89c412c72 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -1681,6 +1681,7 @@ void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) { } struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { + GGML_ASSERT(src); return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne); } @@ -2328,6 +2329,8 @@ struct ggml_tensor * ggml_concat( struct ggml_tensor * b, int dim) { GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS); + GGML_ASSERT(a); + GGML_ASSERT(b); int64_t ne[GGML_MAX_DIMS]; for (int d = 0; d < GGML_MAX_DIMS; ++d) { @@ -2695,6 +2698,8 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(a); + GGML_ASSERT(b); GGML_ASSERT(ggml_can_mul_mat(a, b)); GGML_ASSERT(!ggml_is_transposed(a)); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5db778cd8..7c9876faa 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -159,6 +159,10 @@ endif() # undefine NDEBUG so asserts don't get disabled in tests add_definitions(-UNDEBUG) +if (GGML_NO_ABORT_ON_OOM) + add_compile_definitions(GGML_NO_ABORT_ON_OOM) +endif() + # # test-vec0 @@ -412,3 +416,11 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c) target_link_libraries(${TEST_TARGET} PRIVATE ggml) add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +if(GGML_NO_ABORT_ON_OOM) + set(TEST_TARGET test-oom) + add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) + target_link_libraries(${TEST_TARGET} PRIVATE ggml) + add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) + set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +endif() diff --git a/tests/test-arange.cpp b/tests/test-arange.cpp index 4b7a98584..047ba4887 100644 --- a/tests/test-arange.cpp +++ b/tests/test-arange.cpp @@ -76,7 +76,7 @@ int main(int
/*argc*/, const char** /*argv*/) { ggml_backend_cpu_set_n_threads(backend, n_threads); } - ggml_backend_graph_compute(backend, graph); + GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS); float * output = new float[ggml_nelements(t)]; ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 4c5c4dd9c..5eaeba8aa 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -633,7 +633,9 @@ struct test_case { ggml_build_forward_expand(gf, out); // warmup run - ggml_backend_graph_compute(backend, gf); + ggml_status status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) + printf("Warning: ggml_backend_graph_compute warmup failed: ggml status=%d \n", status); // determine number of runs int n_runs; diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp index 578d3e786..59096a78c 100644 --- a/tests/test-mul-mat.cpp +++ b/tests/test-mul-mat.cpp @@ -151,8 +151,9 @@ struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); } - - ggml_backend_graph_compute(model.backend, gf); + ggml_status status = ggml_backend_graph_compute(model.backend, gf); + if (status != GGML_STATUS_SUCCESS) + return nullptr; //ggml_graph_print(gf); @@ -313,6 +314,10 @@ int main(void) } struct ggml_tensor * result = compute(model, allocr); + if (!result) { + printf("ggml_mul_mat: failed to compute graph"); + return EXIT_FAILURE; + } std::vector out_data(ggml_nelements(result)); diff --git a/tests/test-oom.cpp b/tests/test-oom.cpp new file mode 100644 index 000000000..6b556e99c --- /dev/null +++ b/tests/test-oom.cpp @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include "ggml.h" +#include "ggml-backend.h" +#include "ggml-cpu.h" +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +struct test_model { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer = NULL; + struct ggml_context * ctx = nullptr; + int M =0, N=0, K=1; +}; + +// in MB +size_t getCudaFreeMem() { + size_t cudafree = 0; + size_t cudatotal = 0; + ggml_backend_cuda_get_device_memory(0, &cudafree, &cudatotal); + return cudafree/1024/1024; +} + +ggml_status load_model(test_model & model, unsigned S) { + size_t totalFreeMem = getCudaFreeMem(); + printf("%s: cuda free: %ld MB \n", __func__, totalFreeMem); + + // for a 2d matrix multiplication: K = shared dim, M=num rows for the left tensor A, N=num cols for the right tensor B + model.M = S; + model.N = S; + model.K = S; + printf("%s: M=%d N=%d K=%d \n", __func__, model.M, model.N, model.K); + + size_t buffer_size = 0; + { + buffer_size += (model.M * model.K) * ggml_type_size(GGML_TYPE_F32); // tensor a + buffer_size += (model.K * model.N) * ggml_type_size(GGML_TYPE_F32); // tensor b + buffer_size += (model.M * model.N) * ggml_type_size(GGML_TYPE_F32); // output tensor + buffer_size += 1024; // overhead + } + printf("%s: backend buffer size = %ld KB\n", __func__, buffer_size/1024); + + int num_tensors = 3; + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, // + }; + + // initialize the backend + printf("%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(0); + if (!model.backend) { + printf("%s: ggml_backend_cuda_init() failed\n", __func__); + return GGML_STATUS_FAILED; 
+ } + + model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size); + if (!model.buffer) { + return GGML_STATUS_ALLOC_FAILED; + } + + printf("%s: buffer allocated. cuda free: %ld MB \n", __func__, getCudaFreeMem()); + + // create context + model.ctx = ggml_init(params); + printf("%s: ctx created. cuda free: %ld MB \n", __func__, getCudaFreeMem()); + + // create tensors + printf("%s: creating input tensors...\n", __func__); + model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, model.K, model.M); + model.a->name[0] = 'A'; + //printf("Matrix A: [%i, %i]\n", K, M); + model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, model.K, model.N); + model.b->name[0] = 'B'; + //printf("Matrix B: [%i, %i]\n", K, N); + printf("%s: tensors (a&b) created. cuda free: %ld MB \n", __func__, getCudaFreeMem()); + + // create an allocator + struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer); + + // alloc memory for a + ggml_status s = ggml_tallocr_alloc(&alloc, model.a); + if (s != GGML_STATUS_SUCCESS) + return s; + + // alloc memory for b + return ggml_tallocr_alloc(&alloc, model.b); +} + + +struct ggml_cgraph * build_graph(const test_model& model, ggml_tensor* a, ggml_tensor *b, unsigned repeat) { + printf("build_graph %d...\n", repeat); + static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + printf("%s: graph buf size: %ld KB\n", __func__, buf_size/1024); + static std::vector<uint8_t> buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() + }; + + // create a temporary context to build the graph + struct ggml_context * ctx0 = ggml_init(params0); + if (!ctx0) { + printf("error: ggml_init returned null\n"); + return nullptr; + } + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + if (!gf) + return nullptr; + + // zT = x @ yT + struct ggml_tensor * result = ggml_mul_mat(ctx0, a, ggml_cont(ctx0, b)); + if (!result) { + printf("error: ggml_mul_mat returned null\n"); + return nullptr; + } + + // z = (zT)T + struct ggml_tensor* T = ggml_transpose(ctx0, result); + if (!T) { + fprintf(stderr, "error: ggml_transpose returned null\n"); + return nullptr; + } + + struct ggml_tensor* c = ggml_cont(ctx0, T); + if (!c) { + fprintf(stderr, "error: ggml_cont returned null\n"); + return nullptr; + } + + std::vector<ggml_tensor *> outTensors; + outTensors.push_back(c); + for (unsigned i=0; i < repeat; i++) { + struct ggml_tensor * d = ggml_mul_mat(ctx0, outTensors.back(), ggml_cont(ctx0, outTensors.back())); + if (!d) { + printf("error: ggml_mul_mat returned null\n"); + return nullptr; + } + //printf("%s: matmul out: %s %ld %ld \n", __func__, d->name, d->ne[0], d->ne[1]); + outTensors.push_back(d); + c = ggml_concat(ctx0, c, d, 0); + } + + ggml_build_forward_expand(gf, c); + + // free the temporary context used to build the graph + ggml_free(ctx0); + return gf; +} + +ggml_status compute(const test_model & model, ggml_gallocr_t allocr, unsigned repeat) { + printf("compute ...\n"); + printf("compute: free device mem: %ld MB\n", getCudaFreeMem()); + + ggml_tensor* ot = NULL; + ggml_tensor* left = model.a; + ggml_tensor* right = model.b; + + struct ggml_cgraph * gf = build_graph(model, left, right, repeat); + printf("compute: graph built. free cuda mem: %ld MB\n", getCudaFreeMem()); + + // allocate tensors + if (!ggml_gallocr_alloc_graph(allocr, gf)) + return GGML_STATUS_ALLOC_FAILED; + + printf("%s: graph buf allocated. free device mem: %ld MB\n", __func__, getCudaFreeMem()); + + ggml_status status = ggml_backend_graph_compute(model.backend, gf); + if (status != GGML_STATUS_SUCCESS) + return status; + + ggml_graph_print(gf); + printf("compute: graph computed. free device mem: %ld MB\n", getCudaFreeMem()); + // in this case, the output tensor is the last one in the graph + ot = ggml_graph_node(gf, -1); + if (!ot) + return GGML_STATUS_FAILED; + printf("%s: output tensor shape: %ld x %ld name: %s\n", __func__, ot->ne[0], ot->ne[1], ot->name); + + return GGML_STATUS_SUCCESS; +} + + +int main(void) { +#ifndef GGML_USE_CUDA + fprintf(stderr, "note: test-oom is only implemented for the CUDA backend at the moment\n"); + return 0; +#endif + +#ifndef GGML_NO_ABORT_ON_OOM + fprintf(stderr, "warning: skipping: test-oom requires a GGML_NO_ABORT_ON_OOM build\n"); + return 0; +#endif + + test_model model; + + ggml_status status = load_model(model, 8192); // will also init the backend + if (status != GGML_STATUS_SUCCESS) { + printf("failed to load model\n"); + return GGML_EXIT_ABORTED; + } + + ggml_gallocr_t allocr = NULL; + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + if (!allocr) { + printf("ggml_gallocr_new() failed\n"); + return GGML_EXIT_ABORTED; + } + + // run multiple matmuls in a loop, accumulating large output tensors; this should OOM + status = compute(model, allocr, 160); + if (status == GGML_STATUS_SUCCESS) { + printf("main: compute failed to OOM (matmuls too small to OOM the GPU? loop count too small?)\n"); + return GGML_EXIT_ABORTED; + } + printf("main: compute correctly hit OOM: ggml status=%d, expected %d\n", status, GGML_STATUS_ALLOC_FAILED); + return GGML_EXIT_SUCCESS; +} diff --git a/tests/test-timestep_embedding.cpp b/tests/test-timestep_embedding.cpp index a55865973..e41942633 100644 --- a/tests/test-timestep_embedding.cpp +++ b/tests/test-timestep_embedding.cpp @@ -159,7 +159,7 @@ int main(int argc, const char** argv) { ggml_backend_cpu_set_n_threads(backend, n_threads); } - ggml_backend_graph_compute(backend, graph); + GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS); float * output = new float[ggml_nelements(t)]; ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));
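
Caller-side note: since ggml_tallocr_alloc now returns an enum ggml_status, callers built with -DGGML_NO_ABORT_ON_OOM=ON can detect an over-full buffer instead of crashing. The following is a minimal sketch of that usage, not part of this patch; it assumes the CPU backend and uses arbitrary buffer/tensor sizes purely for illustration.

```c
// Sketch (assumption: library built with -DGGML_NO_ABORT_ON_OOM=ON): the second
// ggml_tallocr_alloc() call reports OOM through its return value instead of aborting.
#include <stdio.h>

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data goes into the backend buffer below
    };
    struct ggml_context * ctx = ggml_init(params);
    ggml_backend_t backend = ggml_backend_cpu_init();

    // deliberately undersized buffer: only the first tensor fits
    ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, 1024*1024);
    struct ggml_tallocr talloc = ggml_tallocr_new(buf);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256);   // ~256 KB, fits
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096); // ~64 MB, does not fit

    enum ggml_status sa = ggml_tallocr_alloc(&talloc, a); // expected: GGML_STATUS_SUCCESS
    enum ggml_status sb = ggml_tallocr_alloc(&talloc, b); // expected: GGML_STATUS_ALLOC_FAILED
                                                          // (a default build would abort here instead)
    printf("alloc a: %d, alloc b: %d (alloc-failed = %d)\n", (int) sa, (int) sb, GGML_STATUS_ALLOC_FAILED);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return sb == GGML_STATUS_ALLOC_FAILED ? 0 : 1;
}
```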