@@ -1943,16 +1943,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
 
     bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
     bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
-    bool any_gpus_with_slow_fp16   = false;
-    bool any_gpus_without_fp16_mma = false;
+    bool any_gpus_with_slow_fp16 = false;
 
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1963,16 +1961,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                 continue;
             }
 
-            const int cc              = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
-            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+            const int cc            = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            use_mul_mat_vec         = use_mul_mat_vec         && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
-        const int cc              = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
-        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+        const int cc            = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_vec         = use_mul_mat_vec         && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }
 
     // debug helpers
@@ -1983,7 +1981,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     // printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
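
The new per-device gate ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]) takes over the conditions this diff removes from the call site: the inline src0->ne[0] % 2 == 0 and src1->ne[1] == 1 checks, the src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma dispatch test, and with it the separate any_gpus_without_fp16_mma accumulator. The sketch below shows one possible shape for such a predicate when those removed checks are folded together; it is written only against what is visible in this diff, not the actual helper body, and the constants and the compute-capability test for fp16 MMA are placeholders.

    // Sketch only: one possible shape for the consolidated gate that the diff calls as
    // ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]). It folds in the
    // checks removed from the call site above; names prefixed with sketch_/SKETCH_ are
    // stand-ins, not the real definitions in the CUDA backend.
    #include <cstdint>

    static bool sketch_fp16_mma_available(int cc) { return cc >= 700; } // assumption: tensor cores from Volta on
    static const int64_t SKETCH_MMV_MAX_ROWS  = 512;                    // placeholder for MMV_MAX_ROWS
    static const int64_t SKETCH_MMV_MAX_BATCH = 8;                      // placeholder: the old call site required ne11 == 1

    static bool sketch_should_use_mmv(int /*src0_type*/, int cc, const int64_t * src0_ne, int64_t ne11) {
        // formerly inline at the call site: even row length and a (small) limit on src1 columns
        if (src0_ne[0] % 2 != 0 || ne11 > SKETCH_MMV_MAX_BATCH) {
            return false;
        }
        // formerly the dispatch test: prefer batched cuBLAS GEMM unless src0 is thin
        // (<= MMV_MAX_ROWS rows) or the device lacks fp16 MMA (tensor cores)
        return src0_ne[1] <= SKETCH_MMV_MAX_ROWS || !sketch_fp16_mma_available(cc);
    }

With the device-dependent part inside the helper, each branch of the split/non-split code only needs the single use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(...) update, which is why the any_gpus_without_fp16_mma bookkeeping and the extra condition on the !split dispatch can be dropped.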