Skip to content

Commit

Permalink
metal : add ggml_upscale
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov committed Dec 8, 2023
1 parent 6303f0d commit 473f254
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 4 deletions.
35 changes: 35 additions & 0 deletions src/ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@
GGML_METAL_DECL_KERNEL(rope_f16);
GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(im2col_f16);
GGML_METAL_DECL_KERNEL(upscale_f32);
GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc);
GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc);
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
Expand Down Expand Up @@ -382,6 +383,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
GGML_METAL_ADD_KERNEL(rope_f16);
GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(im2col_f16);
GGML_METAL_ADD_KERNEL(upscale_f32);
GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc);
GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc);
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
Expand Down Expand Up @@ -482,6 +484,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(rope_f16);
GGML_METAL_DEL_KERNEL(alibi_f32);
GGML_METAL_DEL_KERNEL(im2col_f16);
GGML_METAL_DEL_KERNEL(upscale_f32);
GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc);
GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc);
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
Expand Down Expand Up @@ -819,6 +822,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
case GGML_OP_ALIBI:
case GGML_OP_ROPE:
case GGML_OP_IM2COL:
case GGML_OP_UPSCALE:
case GGML_OP_ARGSORT:
case GGML_OP_DUP:
case GGML_OP_CPY:
Expand Down Expand Up @@ -1850,6 +1854,37 @@ void ggml_metal_graph_compute(

[encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
} break;
case GGML_OP_UPSCALE:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);

const int sf = dst->op_params[0];

[encoder setComputePipelineState:ctx->pipeline_upscale_f32];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];

const int nth = MIN(1024, ne0);

[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARGSORT:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
Expand Down
48 changes: 44 additions & 4 deletions src/ggml-metal.metal
Original file line number Diff line number Diff line change
Expand Up @@ -1549,6 +1549,46 @@ kernel void kernel_im2col_f16(
}
}

kernel void kernel_upscale_f32(
device const char * src0,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int32_t & sf,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {

const int64_t i3 = tgpig.z;
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;

const int64_t i03 = i3;
const int64_t i02 = i2;
const int64_t i01 = i1/sf;

device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);

for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = src0_ptr[i0/sf];
}
}

// bitonic sort implementation following the CUDA kernels as reference
typedef void (argsort_t)(
device const float * x,
Expand Down Expand Up @@ -1918,9 +1958,9 @@ kernel void kernel_cpy_f32_q4_1(
}

kernel void kernel_concat(
device const char * src0,
device const char * src1,
device char * dst,
device const char * src0,
device const char * src1,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
Expand Down Expand Up @@ -1957,7 +1997,7 @@ kernel void kernel_concat(
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;

device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00;
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;

Expand Down

0 comments on commit 473f254

Please sign in to comment.