Skip to content

Commit 7474e00

Browse files
CUDA: fix crash with partial offloading of MoE (#13439)
1 parent 7f323a5 commit 7474e00

File tree

3 files changed

+12
-6
lines changed

3 files changed

+12
-6
lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1909,13 +1909,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
19091909
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
19101910
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
19111911

1912+
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
1913+
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
1914+
// Therefore, in such cases use cuBLAS.
1915+
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
1916+
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
1917+
19121918
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
19131919
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
19141920
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
1915-
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
1921+
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
19161922
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
19171923
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
1918-
bool use_mul_mat_q = ggml_is_quantized(src0->type)
1924+
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
19191925
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
19201926

19211927
bool any_gpus_with_slow_fp16 = false;

ggml/src/ggml-cuda/mmq.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(
9191

9292
// If src0 is a temporary compute buffer, clear any potential padding.
9393
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
94-
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
95-
GGML_ASSERT(!src0->view_src);
9694
const size_t size_data = ggml_nbytes(src0);
9795
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
9896
if (size_alloc > size_data) {
97+
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
98+
GGML_ASSERT(!src0->view_src);
9999
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
100100
}
101101
}

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(
515515

516516
// If src0 is a temporary compute buffer, clear any potential padding.
517517
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
518-
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
519-
GGML_ASSERT(!src0->view_src);
520518
const size_t size_data = ggml_nbytes(src0);
521519
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
522520
if (size_alloc > size_data) {
521+
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
522+
GGML_ASSERT(!src0->view_src);
523523
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
524524
}
525525
}

0 commit comments

Comments (0)