diff --git a/.gitignore b/.gitignore index 4bf0fa095..ec4cf7f5f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build/ +release/ build-*/ out/ tmp/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 75b5ea3b4..7c87f867d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,6 +87,7 @@ option(GGML_GPROF "ggml: enable gprof" # build option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF) +option(GGML_NO_ABORT_ON_OOM "ggml: do not abort on OOM (experimental)" OFF) # sanitizers option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF) diff --git a/include/ggml-alloc.h b/include/ggml-alloc.h index 23600eea9..2cb150fd2 100644 --- a/include/ggml-alloc.h +++ b/include/ggml-alloc.h @@ -19,7 +19,7 @@ struct ggml_tallocr { }; GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer); -GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor); +GGML_API enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor); // Graph allocator /* diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0002ac18a..e2be8db93 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,6 +60,10 @@ if (GGML_FATAL_WARNINGS) endif() endif() +if (GGML_NO_ABORT_ON_OOM) + add_compile_definitions(GGML_NO_ABORT_ON_OOM) +endif() + if (GGML_ALL_WARNINGS) if (NOT MSVC) list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 9a3bf9f29..e12b3ecce 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -89,14 +89,18 @@ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) { return talloc; } -void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) { +enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) { size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); size = GGML_PAD(size, talloc->alignment); if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { - GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", + GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n", __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); +#ifdef GGML_NO_ABORT_ON_OOM + return GGML_STATUS_ALLOC_FAILED; +#else GGML_ABORT("not enough space in the buffer"); +#endif } void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; @@ -105,6 +109,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tenso assert(((uintptr_t)addr % talloc->alignment) == 0); ggml_backend_tensor_alloc(talloc->buffer, tensor, addr); + return GGML_STATUS_SUCCESS; } // dynamic tensor allocator @@ -150,6 +155,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs } #endif +// Check with reviewer: could this function return a ggml_status instead (with the offset as an out argument)? static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); @@ -179,6 +185,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz // this should never happen GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", __func__, size, max_avail); + // Note: no way to honor GGML_NO_ABORT_ON_OOM since this function
returns the offset, not a ggml_status GGML_ABORT("not enough space in the buffer"); } } @@ -378,6 +385,7 @@ struct ggml_gallocr { }; ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { + //GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs); ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr)); GGML_ASSERT(galloc != NULL); @@ -670,7 +678,10 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } } +// Returns true on success, false otherwise +// Check with reviewers: any cons to return a ggml_status? bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { + //GGML_LOG_TRACE("ggml_gallocr_reserve_n\n"); size_t min_hash_size = graph->n_nodes + graph->n_leafs; // add 25% margin to avoid hash collisions min_hash_size += min_hash_size / 4; @@ -865,6 +876,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph return false; } +// Check with reviewers: any cons to return a ggml_status here? bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { if (ggml_gallocr_needs_realloc(galloc, graph)) { if (galloc->n_buffers == 1) { @@ -954,7 +966,9 @@ static bool alloc_tensor_range(struct ggml_context * ctx, for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { if (t->data == NULL) { if (t->view_src == NULL) { - ggml_tallocr_alloc(&tallocr, t); + enum ggml_status s = ggml_tallocr_alloc(&tallocr, t); + if (s != GGML_STATUS_SUCCESS) + GGML_LOG_WARN("%s: failed to alloc tensor %s \n", __func__, t->name); } else if (t->buffer == NULL) { ggml_backend_view_init(t); } diff --git a/src/ggml-backend-impl.h b/src/ggml-backend-impl.h index d1c2d76d8..1781c27de 100644 --- a/src/ggml-backend-impl.h +++ b/src/ggml-backend-impl.h @@ -44,7 +44,7 @@ extern "C" { // base address of the buffer void * (*get_base) (ggml_backend_buffer_t buffer); // (optional) initialize a tensor in the buffer (eg. 
add tensor extras) - void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + enum ggml_status (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // tensor data access void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); diff --git a/src/ggml-cpu/amx/amx.cpp b/src/ggml-cpu/amx/amx.cpp index 5ec5263ce..8c8ade202 100644 --- a/src/ggml-cpu/amx/amx.cpp +++ b/src/ggml-cpu/amx/amx.cpp @@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *) (buffer->context); } -static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +static ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor); GGML_UNUSED(buffer); + return GGML_STATUS_SUCCESS; } static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, diff --git a/src/ggml-cpu/ggml-cpu-aarch64.cpp b/src/ggml-cpu/ggml-cpu-aarch64.cpp index b311a5b1c..8d7402fdb 100644 --- a/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -4135,10 +4135,11 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con return nullptr; } -static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +static ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { tensor->extra = (void *) const_cast(ggml_aarch64_get_optimal_repack_type(tensor)); GGML_UNUSED(buffer); + return GGML_STATUS_SUCCESS; } static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, diff --git a/src/ggml-cuda/common.cuh b/src/ggml-cuda/common.cuh index 174916bc9..4685c0b7f 100644 --- a/src/ggml-cuda/common.cuh +++ b/src/ggml-cuda/common.cuh @@ -79,18 +79,19 @@ #define GGML_CUDA_MAX_STREAMS 8 -[[noreturn]] -void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg); +// Print the error. 
Will also abort if abort true +void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg, bool abort); -#define CUDA_CHECK_GEN(err, success, error_fn) \ +#define CUDA_CHECK_GEN(err, success, error_fn, abort) \ do { \ auto err_ = (err); \ if (err_ != (success)) { \ - ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \ + ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_), abort); \ } \ } while (0) -#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) +#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, true) +#define CUDA_CHECK_NO_ABORT(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, false) #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { @@ -113,7 +114,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in } #endif // CUDART_VERSION >= 12000 -#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) +#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str, true) #if !defined(GGML_USE_HIP) static const char * cu_get_error_str(CUresult err) { @@ -121,7 +122,8 @@ static const char * cu_get_error_str(CUresult err) { cuGetErrorString(err, &err_str); return err_str; } -#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) +// Will print error and abort +#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str, true) #endif #if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu index bda10aec1..2fdce8b38 100644 --- a/src/ggml-cuda/ggml-cuda.cu +++ b/src/ggml-cuda/ggml-cuda.cu @@ -60,8 +60,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); -[[noreturn]] -void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { +void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg, bool abort) { int id = -1; // in case cudaGetDevice fails (void)cudaGetDevice(&id); @@ -69,7 +68,8 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_LOG_ERROR(" %s\n", stmt); // abort with GGML_ABORT to get a stack trace - GGML_ABORT(GGML_CUDA_NAME " error"); + if (abort) + GGML_ABORT(GGML_CUDA_NAME " error"); } // this is faster on Windows @@ -356,7 +356,14 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { size_t look_ahead_size = (size_t) (1.05 * size); look_ahead_size = 256 * ((look_ahead_size + 255)/256); ggml_cuda_set_device(device); - CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device)); + cudaError_t status = ggml_cuda_device_malloc(&ptr, look_ahead_size, device); +#ifdef GGML_NO_ABORT_ON_OOM + CUDA_CHECK_NO_ABORT(status); +#else + CUDA_CHECK(status); +#endif + if (!ptr) + return ptr; *actual_size = look_ahead_size; pool_size += look_ahead_size; #ifdef DEBUG_CUDA_MALLOC @@ -533,12 +540,12 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { return ctx->dev_ptr; } -static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_cuda_buffer_context * ctx = 
(ggml_backend_cuda_buffer_context *)buffer->context; if (tensor->view_src != NULL) { assert(tensor->view_src->buffer->buft == buffer->buft); - return; + return GGML_STATUS_SUCCESS; } if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) { @@ -551,6 +558,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); } } + return GGML_STATUS_SUCCESS; } static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -785,7 +793,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff GGML_UNUSED(buffer); } -static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -816,8 +824,17 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf // FIXME: do not crash if cudaMalloc fails // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first ggml_cuda_set_device(id); - char * buf; - CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id)); + char * buf = NULL; + cudaError_t status = ggml_cuda_device_malloc((void**)&buf, size, id); +#ifdef GGML_NO_ABORT_ON_OOM + CUDA_CHECK_NO_ABORT(status); + if (status != cudaSuccess) + return GGML_STATUS_ALLOC_FAILED; +#else + CUDA_CHECK(status); +#endif + if (!buf) + return GGML_STATUS_ALLOC_FAILED; // set padding to 0 to avoid possible NaN values if (size > original_size) { @@ -831,6 +848,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf } } tensor->extra = extra; + return GGML_STATUS_SUCCESS; } static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -1685,7 +1703,7 @@ static __global__ void k_compute_batched_ptrs( ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } -static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static ggml_status ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); @@ -1707,6 +1725,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co // convert src1 to fp16 ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); +#ifdef GGML_NO_ABORT_ON_OOM + if (!(src1_f16_alloc.ptr)) + return GGML_STATUS_ALLOC_FAILED; +#endif if (src1->type != GGML_TYPE_F16) { const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); const int64_t ne_src1 = ggml_nelements(src1); @@ -1835,9 +1857,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream); } + return GGML_STATUS_SUCCESS; } -static void 
ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static ggml_status ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) @@ -1888,7 +1911,9 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention - ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); + ggml_status st = ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); + if (st != GGML_STATUS_SUCCESS) + return st; } else if (use_mul_mat_vec) { ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr); } else if (use_mul_mat_vec_q) { @@ -1898,6 +1923,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else { ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr); } + return GGML_STATUS_SUCCESS; } struct mmid_row_mapping { @@ -1954,7 +1980,7 @@ static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_origin } } -static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +static ggml_status ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * ids = dst->src[2]; @@ -2014,7 +2040,9 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * src1_row.data = src1_original + i11*nb11 + i12*nb12; dst_row.data = dst_original + i1*nb1 + i2*nb2; - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + ggml_status s = ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + if (s != GGML_STATUS_SUCCESS) + return s; } } } else { @@ -2076,7 +2104,9 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1; - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + ggml_status s = ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + if (s != GGML_STATUS_SUCCESS) + return s; { dim3 block_dims(std::min((unsigned int)ne0, 768u)); @@ -2090,8 +2120,10 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * } } } + return GGML_STATUS_SUCCESS; } +// Check with reviewers: any cons to return a ggml_status instead ? static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { // why is this here instead of mul_mat? 
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) { @@ -2219,11 +2251,13 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); return false; } else { - ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst); + if (ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst) != GGML_STATUS_SUCCESS) + return false; } break; case GGML_OP_MUL_MAT_ID: - ggml_cuda_mul_mat_id(ctx, dst); + if (ggml_cuda_mul_mat_id(ctx, dst) != GGML_STATUS_SUCCESS) + return false; break; case GGML_OP_OUT_PROD: ggml_cuda_out_prod(ctx, dst); @@ -2627,7 +2661,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { } #endif -static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, +// Check with reviewers: shouldn't it return a ggml_status? +static bool evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { @@ -2692,6 +2727,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx graph_evaluated_or_captured = true; #endif // USE_CUDA_GRAPH } + return true; } static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { diff --git a/src/ggml.c b/src/ggml.c index 3b4861542..89c412c72 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -1681,6 +1681,7 @@ void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) { } struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { + GGML_ASSERT(src); return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne); } @@ -2328,6 +2329,8 @@ struct ggml_tensor * ggml_concat( struct ggml_tensor * b, int dim) { GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS); + GGML_ASSERT(a); + GGML_ASSERT(b); int64_t ne[GGML_MAX_DIMS]; for (int d = 0; d < GGML_MAX_DIMS; ++d) { @@ -2695,6 +2698,8 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(a); + GGML_ASSERT(b); GGML_ASSERT(ggml_can_mul_mat(a, b)); GGML_ASSERT(!ggml_is_transposed(a)); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5db778cd8..7c9876faa 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -159,6 +159,10 @@ endif() # undefine NDEBUG so asserts don't get disabled in tests add_definitions(-UNDEBUG) +if (GGML_NO_ABORT_ON_OOM) + add_compile_definitions(GGML_NO_ABORT_ON_OOM) +endif() + # # test-vec0 @@ -412,3 +416,11 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c) target_link_libraries(${TEST_TARGET} PRIVATE ggml) add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +if(GGML_NO_ABORT_ON_OOM) + set(TEST_TARGET test-oom) + add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) + target_link_libraries(${TEST_TARGET} PRIVATE ggml) + add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) + set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +endif() diff --git a/tests/test-arange.cpp b/tests/test-arange.cpp index 4b7a98584..047ba4887 100644 --- a/tests/test-arange.cpp +++ b/tests/test-arange.cpp @@ -76,7 +76,7 @@ int main(int
/*argc*/, const char** /*argv*/) { ggml_backend_cpu_set_n_threads(backend, n_threads); } - ggml_backend_graph_compute(backend, graph); + GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS); float * output = new float[ggml_nelements(t)]; ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 4c5c4dd9c..5eaeba8aa 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -633,7 +633,9 @@ struct test_case { ggml_build_forward_expand(gf, out); // warmup run - ggml_backend_graph_compute(backend, gf); + ggml_status status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) + printf("Warning: ggml_backend_graph_compute warmup failed: ggml status=%d \n", status); // determine number of runs int n_runs; diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp index 578d3e786..59096a78c 100644 --- a/tests/test-mul-mat.cpp +++ b/tests/test-mul-mat.cpp @@ -151,8 +151,9 @@ struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); } - - ggml_backend_graph_compute(model.backend, gf); + ggml_status status = ggml_backend_graph_compute(model.backend, gf); + if (status != GGML_STATUS_SUCCESS) + return nullptr; //ggml_graph_print(gf); @@ -313,6 +314,10 @@ int main(void) } struct ggml_tensor * result = compute(model, allocr); + if (!result) { + printf("ggml_mul_mat: failed to compute graph"); + return EXIT_FAILURE; + } std::vector out_data(ggml_nelements(result)); diff --git a/tests/test-oom.cpp b/tests/test-oom.cpp new file mode 100644 index 000000000..6b556e99c --- /dev/null +++ b/tests/test-oom.cpp @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include "ggml.h" +#include "ggml-backend.h" +#include "ggml-cpu.h" +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +struct test_model { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer = NULL; + struct ggml_context * ctx = nullptr; + int M =0, N=0, K=1; +}; + +// in MB +size_t getCudaFreeMem() { + size_t cudafree = 0; + size_t cudatotal = 0; + ggml_backend_cuda_get_device_memory(0, &cudafree, &cudatotal); + return cudafree/1024/1024; +} + +ggml_status load_model(test_model & model, unsigned S) { + size_t totalFreeMem = getCudaFreeMem(); + printf("%s: cuda free: %ld MB \n", __func__, totalFreeMem); + + // for a 2d matrix multiplication: K = shared dim, M=num rows for the left tensor A, N=num cols for the right tensor B + model.M = S; + model.N = S; + model.K = S; + printf("%s: M=%d N=%d K=%d \n", __func__, model.M, model.N, model.K); + + size_t buffer_size = 0; + { + buffer_size += (model.M * model.K) * ggml_type_size(GGML_TYPE_F32); // tensor a + buffer_size += (model.K * model.N) * ggml_type_size(GGML_TYPE_F32); // tensor b + buffer_size += (model.M * model.N) * ggml_type_size(GGML_TYPE_F32); // output tensor + buffer_size += 1024; // overhead + } + printf("%s: backend buffer size = %ld KB\n", __func__, buffer_size/1024); + + int num_tensors = 3; + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, // + }; + + // initialize the backend + printf("%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(0); + if (!model.backend) { + printf("%s: ggml_backend_cuda_init() failed\n", __func__); + return GGML_STATUS_FAILED; 
+ } + + model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size); + if (!model.buffer) { + return GGML_STATUS_ALLOC_FAILED; + } + + printf("%s: buffer allocated. cuda free: %ld MB \n", __func__, getCudaFreeMem()); + + // create context + model.ctx = ggml_init(params); + printf("%s: ctx created. cuda free: %ld MB \n", __func__, getCudaFreeMem()); + + // create tensors + printf("%s: creating input tensors...\n", __func__); + model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, model.K, model.M); + model.a->name[0] = 'A'; + //printf("Matrix A: [%i, %i]\n", K, M); + model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, model.K, model.N); + model.b->name[0] = 'B'; + //printf("Matrix B: [%i, %i]\n", K, N); + printf("%s: tensors (a&b) created. cuda free: %ld MB \n", __func__, getCudaFreeMem()); + + // create an allocator + struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer); + + // alloc memory for a + ggml_status s = ggml_tallocr_alloc(&alloc, model.a); + if (s != GGML_STATUS_SUCCESS) + return s; + + // alloc memory for b + return ggml_tallocr_alloc(&alloc, model.b); +} + + +struct ggml_cgraph * build_graph(const test_model& model, ggml_tensor* a, ggml_tensor *b, unsigned repeat) { + printf("build_graph %d...\n", repeat); + static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + printf("%s: graph buf size: %ld KB\n", __func__, buf_size/1024); + static std::vector<uint8_t> buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() + }; + + // create a temporary context to build the graph + struct ggml_context * ctx0 = ggml_init(params0); + if (!ctx0) { + printf("error: ggml_init returned null\n"); + return nullptr; + } + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + if (!gf) + return nullptr; + + // zT = x @ yT + struct ggml_tensor * result = ggml_mul_mat(ctx0, a, ggml_cont(ctx0, b)); + if (!result) { + printf("error: ggml_mul_mat returned null\n"); + return nullptr; + } + + // z = (zT)T + struct ggml_tensor* T = ggml_transpose(ctx0, result); + if (!T) { + fprintf(stderr, "error: ggml_transpose returned null\n"); + return nullptr; + } + + struct ggml_tensor* c = ggml_cont(ctx0, T); + if (!c) { + fprintf(stderr, "error: ggml_cont returned null\n"); + return nullptr; + } + + std::vector<ggml_tensor *> outTensors; + outTensors.push_back(c); + for (unsigned i=0; i < repeat; i++) { + struct ggml_tensor * d = ggml_mul_mat(ctx0, outTensors.back(), ggml_cont(ctx0, outTensors.back())); + if (!d) { + printf("error: ggml_mul_mat returned null\n"); + return nullptr; + } + //printf("%s: matmul out: %s %ld %ld \n", __func__, d->name, d->ne[0], d->ne[1]); + outTensors.push_back(d); + c = ggml_concat(ctx0, c, d, 0); + } + + ggml_build_forward_expand(gf, c); + + // free the temporary context used to build the graph + ggml_free(ctx0); + return gf; +} + +ggml_status compute(const test_model & model, ggml_gallocr_t allocr, unsigned repeat) { + printf("compute ...\n"); + printf("compute: free device mem: %ld MB\n", getCudaFreeMem()); + + ggml_tensor* ot = NULL; + ggml_tensor* left = model.a; + ggml_tensor* right = model.b; + + struct ggml_cgraph * gf = build_graph(model, left, right, repeat); + printf("compute: graph built. free cuda mem: %ld MB\n", getCudaFreeMem()); + + // allocate tensors + if (!ggml_gallocr_alloc_graph(allocr, gf)) + return GGML_STATUS_ALLOC_FAILED; + + printf("%s: graph buf allocated. free device mem: %ld MB\n", __func__, getCudaFreeMem()); + + ggml_status status = ggml_backend_graph_compute(model.backend, gf); + if (status != GGML_STATUS_SUCCESS) + return status; + + ggml_graph_print(gf); + printf("compute: graph computed. free device mem: %ld MB\n", getCudaFreeMem()); + // in this case, the output tensor is the last one in the graph + ot = ggml_graph_node(gf, -1); + if (!ot) + return GGML_STATUS_FAILED; + printf("%s: output tensor shape: %ld x %ld name: %s\n", __func__, ot->ne[0], ot->ne[1], ot->name); + + return GGML_STATUS_SUCCESS; +} + + +int main(void) { +#ifndef GGML_USE_CUDA + fprintf(stderr, "note: test-oom is only implemented for the CUDA backend at the moment\n"); + return 0; +#endif + +#ifndef GGML_NO_ABORT_ON_OOM + fprintf(stderr, "warning: skipping: test-oom requires a GGML_NO_ABORT_ON_OOM build\n"); + return 0; +#endif + + test_model model; + + ggml_status status = load_model(model, 8192); // will also init the backend + if (status != GGML_STATUS_SUCCESS) { + printf("failed to load model\n"); + return GGML_EXIT_ABORTED; + } + + ggml_gallocr_t allocr = NULL; + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + if (!allocr) { + printf("ggml_gallocr_new() failed\n"); + return GGML_EXIT_ABORTED; + } + + // run multiple matmuls in a loop, accumulating large output tensors; this should OOM + status = compute(model, allocr, 160); + if (status == GGML_STATUS_SUCCESS) { + printf("main: compute failed to OOM (matmuls too small to OOM the GPU? loop count too small?)\n"); + return GGML_EXIT_ABORTED; + } + printf("main: compute correctly hit OOM: ggml status=%d, expected %d\n", status, GGML_STATUS_ALLOC_FAILED); + return GGML_EXIT_SUCCESS; +} diff --git a/tests/test-timestep_embedding.cpp b/tests/test-timestep_embedding.cpp index a55865973..e41942633 100644 --- a/tests/test-timestep_embedding.cpp +++ b/tests/test-timestep_embedding.cpp @@ -159,7 +159,7 @@ int main(int argc, const char** argv) { ggml_backend_cpu_set_n_threads(backend, n_threads); } - ggml_backend_graph_compute(backend, graph); + GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS); float * output = new float[ggml_nelements(t)]; ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));
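
Caller-side note: since ggml_tallocr_alloc now returns an enum ggml_status, callers built with -DGGML_NO_ABORT_ON_OOM=ON can detect an over-full buffer instead of crashing. The following is a minimal sketch of that usage, not part of this patch; it assumes the CPU backend and uses arbitrary buffer/tensor sizes purely for illustration.

```c
// Sketch (assumption: library built with -DGGML_NO_ABORT_ON_OOM=ON): the second
// ggml_tallocr_alloc() call reports OOM through its return value instead of aborting.
#include <stdio.h>

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data goes into the backend buffer below
    };
    struct ggml_context * ctx = ggml_init(params);
    ggml_backend_t backend = ggml_backend_cpu_init();

    // deliberately undersized buffer: only the first tensor fits
    ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, 1024*1024);
    struct ggml_tallocr talloc = ggml_tallocr_new(buf);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256);   // ~256 KB, fits
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096); // ~64 MB, does not fit

    enum ggml_status sa = ggml_tallocr_alloc(&talloc, a); // expected: GGML_STATUS_SUCCESS
    enum ggml_status sb = ggml_tallocr_alloc(&talloc, b); // expected: GGML_STATUS_ALLOC_FAILED
                                                          // (a default build would abort here instead)
    printf("alloc a: %d, alloc b: %d (alloc-failed = %d)\n", (int) sa, (int) sb, GGML_STATUS_ALLOC_FAILED);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return sb == GGML_STATUS_ALLOC_FAILED ? 0 : 1;
}
```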