diff --git a/include/ggml.h b/include/ggml.h
index 65cb92c44..1c8cc11b6 100644
--- a/include/ggml.h
+++ b/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
         GGML_OP_POOL_2D_BACK,
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
+        GGML_OP_PAD_REFLECT_1D,
         GGML_OP_ARANGE,
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
@@ -1695,6 +1696,13 @@ extern "C" {
             int                   p2,
             int                   p3);
 
+    // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
+    GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   p0,
+            int                   p1);
+
     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
     // timesteps: [N,]
     // return: [N, dim]
diff --git a/src/ggml-cpu/ggml-cpu.c b/src/ggml-cpu/ggml-cpu.c
index 23ae2e10c..9dcdc4db1 100644
--- a/src/ggml-cpu/ggml-cpu.c
+++ b/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,40 @@ static void ggml_compute_forward_pad(
     }
 }
 
+// ggml_compute_forward_pad_reflect_1d
+
+static void ggml_compute_forward_pad_reflect_1d(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int32_t * opts = (const int32_t *) dst->op_params;
+    const int p0 = opts[0];
+    const int p1 = opts[1];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+                float * left  = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 +         p0*nb0);
+                float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);
+
+                ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
+
+                for (int i0 = 1; i0 <= p0; i0++) { left[-i0]  = left[i0];   }
+                for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
+            }
+        }
+    }
+}
+
 // ggml_compute_forward_arange
@@ -12535,6 +12569,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pad(params, tensor);
             } break;
+        case GGML_OP_PAD_REFLECT_1D:
+            {
+                ggml_compute_forward_pad_reflect_1d(params, tensor);
+            } break;
         case GGML_OP_ARANGE:
             {
                 ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12915,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
diff --git a/src/ggml-metal/ggml-metal.m b/src/ggml-metal/ggml-metal.m
index 093ae9000..71f5525f3 100644
--- a/src/ggml-metal/ggml-metal.m
+++ b/src/ggml-metal/ggml-metal.m
@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
+    GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
     GGML_METAL_KERNEL_TYPE_ARANGE_F32,
     GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -877,6 +878,7 @@ @implementation GGMLMetalClass
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,  conv_transpose_1d_f16_f32,  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                upscale_f32,                true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                    pad_f32,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,         pad_reflect_1d_f32,         true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,     timestep_embedding_f32,     true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                 arange_f32,                 true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,        argsort_f32_i32_asc,        true);
@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_OP_POOL_2D:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
@@ -3258,6 +3261,38 @@ static void ggml_metal_encode_node(
 
                 const int nth = MIN(1024, ne0);
 
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_PAD_REFLECT_1D:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                const int32_t p0 = ((const int32_t *)(dst->op_params))[0];
+                const int32_t p1 = ((const int32_t *)(dst->op_params))[1];
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:6];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:11];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:12];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:13];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:14];
+                [encoder setBytes:&p0   length:sizeof(p0)   atIndex:15];
+                [encoder setBytes:&p1   length:sizeof(p1)   atIndex:16];
+
+                const int nth = MIN(1024, ne0);
+
                 [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
         case GGML_OP_ARANGE:
diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal
index 5caa0846a..ee3e353ab 100644
--- a/src/ggml-metal/ggml-metal.metal
+++ b/src/ggml-metal/ggml-metal.metal
@@ -2897,6 +2897,53 @@ kernel void kernel_pad_f32(
     }
 }
 
+kernel void kernel_pad_reflect_1d_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant   int64_t & ne0,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    constant   int32_t & p0,
+    constant   int32_t & p1,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3  tgpg[[threadgroups_per_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
+
+    const int64_t i03 = i3;
+    const int64_t i02 = i2;
+    const int64_t i01 = i1;
+
+    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
+
+    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+            if (i0 < p0) {
+                dst_ptr[i0] = src0_ptr[p0 - i0];
+            } else if (i0 < ne0 - p1) {
+                dst_ptr[i0] = src0_ptr[i0 - p0];
+            } else {
+                dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1];
+            }
+        }
+    }
+}
+
 kernel void kernel_arange_f32(
     device        char * dst,
     constant   int64_t & ne0,
diff --git a/src/ggml.c b/src/ggml.c
index 1a9a7efaf..2c338dee5 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_2D_BACK",
     "UPSCALE",
     "PAD",
+    "PAD_REFLECT_1D",
     "ARANGE",
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_2d_back(x)",
     "upscale(x)",
     "pad(x)",
+    "pad_reflect_1d(x)",
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4097,6 +4099,37 @@ struct ggml_tensor * ggml_pad(
     return result;
 }
 
+// ggml_pad_reflect_1d
+
+struct ggml_tensor * ggml_pad_reflect_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   p0,
+        int                   p1) {
+    GGML_ASSERT(p0 >= 0);
+    GGML_ASSERT(p1 >= 0);
+
+    GGML_ASSERT(p0 < a->ne[0]); // padding length on each side must be less than the
+    GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
+
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0 + p1,
+            a->ne[1],
+            a->ne[2],
+            a->ne[3]);
+
+    int32_t params[] = { p0, p1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_PAD_REFLECT_1D;
+    result->src[0] = a;
+
+    return result;
+}
+
 // ggml_arange
 
 struct ggml_tensor * ggml_arange(
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e1ee785fa..5f8ee9c01 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -295,6 +295,13 @@ endif()
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
 
+#
+# test-pad-reflect-1d
+
+set(TEST_TARGET test-pad-reflect-1d)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 
 #
 # test-conv-transpose
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 87c92dadd..da1bf809f 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2697,6 +2697,33 @@ struct test_pad : public test_case {
     }
 };
 
+// GGML_OP_PAD_REFLECT_1D
+struct test_pad_reflect_1d : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne_a;
+    const int pad_0;
+    const int pad_1;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
+    }
+
+    test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {512, 34, 2, 1},
+            int pad_0 = 10, int pad_1 = 9)
+        : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, pad_0, pad_1);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
 // GGML_OP_ARANGE
 struct test_arange : public test_case {
     const ggml_type type;
@@ -3816,6 +3843,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_acc());
     test_cases.emplace_back(new test_pad());
+    test_cases.emplace_back(new test_pad_reflect_1d());
     test_cases.emplace_back(new test_arange());
     test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());
diff --git a/tests/test-pad-reflect-1d.cpp b/tests/test-pad-reflect-1d.cpp
new file mode 100644
index 000000000..764c3e5b8
--- /dev/null
+++ b/tests/test-pad-reflect-1d.cpp
@@ -0,0 +1,211 @@
+#include "ggml.h"
+#include "ggml-cpu.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
+struct ggml_context* make_ctx(void) {
+    struct ggml_init_params params = {
+        .mem_size = 2 * 1024 * 1024,
+    };
+    return ggml_init(params);
+}
+
+void check_tensor(struct ggml_tensor * t, float * expected_t_d, int ne0, int ne1, int ne2) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    GGML_ASSERT(t->ne[0] == ne0);
+    GGML_ASSERT(t->ne[1] == ne1);
+    GGML_ASSERT(t->ne[2] == ne2);
+    for (int i2 = 0; i2 < ne2; ++i2) {
+        for (int i1 = 0; i1 < ne1; ++i1) {
+            for (int i0 = 0; i0 < ne0; ++i0) {
+                float expected = *(expected_t_d + i2 * ne1 * ne0 + i1 * ne0 + i0);
+                float actual = ggml_get_data_f32(t)[i2 * ne1 * ne0 + i1 * ne0 + i0];
+                if (expected != actual) {
+                    printf("expected %.1f, got %.1f at (%d,%d,%d)\n", expected, actual, i0, i1, i2);
+                }
+                GGML_ASSERT(expected == actual);
+            }
+        }
+    }
+}
+
+void test_pad_reflect_1d(bool use_gpu) {
+    ggml_backend_t backend = NULL;
+    struct ggml_init_params params;
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx;
+    struct ggml_tallocr tallocr;
+    ggml_gallocr_t gallocr;
+
+    // initialize the backend
+#ifdef GGML_USE_CUDA
+    if (use_gpu) {
+        fprintf(stderr, "%s: using CUDA backend\n", __func__);
+        backend = ggml_backend_cuda_init(0);
+        if (!backend) {
+            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+        }
+    }
+#endif
+
+#ifdef GGML_USE_METAL
+    if (use_gpu) {
+        fprintf(stderr, "%s: using Metal backend\n", __func__);
+        backend = ggml_backend_metal_init();
+        if (!backend) {
+            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        }
+    }
+#endif
+
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+
+    // Test cases for different padding configurations
+    {
+        params = (struct ggml_init_params){
+            .mem_size   = 16*1024*1024,
+            .mem_buffer = NULL,
+            .no_alloc   = true,
+        };
+
+        ggml_log_set(ggml_log_callback_default, nullptr);
+
+        ctx = ggml_init(params);
+        buffer = ggml_backend_alloc_buffer(backend, 16*1024*1024);
+        tallocr = ggml_tallocr_new(buffer);
+        gallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+
+        // Create a simple 1D input tensor [1, 2, 3, 4]
+        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
+        float input_data[] = {1.0f, 2.0f, 3.0f, 4.0f};
+        ggml_tallocr_alloc(&tallocr, t);
+
+        // load data to buffer
+        if(ggml_backend_is_cpu(backend)) {
+            memcpy(t->data, input_data, ggml_nbytes(t));
+        } else {
+            ggml_backend_tensor_set(t, input_data, 0, ggml_nbytes(t));
+        }
+
+        // Test case 1: pad left=1, right=1
+        // Expected: [2, 1, 2, 3, 4, 3]
+        float expected_1[] = {2.0f, 1.0f, 2.0f, 3.0f, 4.0f, 3.0f};
+        struct ggml_tensor * out_1 = ggml_pad_reflect_1d(ctx, t, 1, 1);
+
+        // Test case 2: pad left=2, right=1
+        // Expected: [3, 2, 1, 2, 3, 4, 3]
+        float expected_2[] = {3.0f, 2.0f, 1.0f, 2.0f, 3.0f, 4.0f, 3.0f};
+        struct ggml_tensor * out_2 = ggml_pad_reflect_1d(ctx, t, 2, 1);
+
+        // Test case 3: pad left=1, right=2
+        // Expected: [2, 1, 2, 3, 4, 3, 2]
+        float expected_3[] = {2.0f, 1.0f, 2.0f, 3.0f, 4.0f, 3.0f, 2.0f};
+        struct ggml_tensor * out_3 = ggml_pad_reflect_1d(ctx, t, 1, 2);
+
+        struct ggml_cgraph * gf = ggml_new_graph(ctx);
+        ggml_build_forward_expand(gf, out_1);
+        ggml_build_forward_expand(gf, out_2);
+        ggml_build_forward_expand(gf, out_3);
+
+        ggml_gallocr_alloc_graph(gallocr, gf);
+
+        ggml_backend_graph_compute(backend, gf);
+
+        check_tensor(out_1, expected_1, 6, 1, 1);
+        check_tensor(out_2, expected_2, 7, 1, 1);
+        check_tensor(out_3, expected_3, 7, 1, 1);
+
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+        ggml_gallocr_free(gallocr);
+    }
+
+    {
+        params = (struct ggml_init_params){
+            .mem_size   = 16*1024*1024,
+            .mem_buffer = NULL,
+            .no_alloc   = true,
+        };
+
+        ggml_log_set(ggml_log_callback_default, nullptr);
+
+        ctx = ggml_init(params);
+        buffer = ggml_backend_alloc_buffer(backend, 16*1024*1024);
+        tallocr = ggml_tallocr_new(buffer);
+        gallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+
+        // Create a 2D input tensor (5 columns × 4 rows)
+        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 5, 4);
+        float input_data[] = {
+             1.0f,  2.0f,  3.0f,  4.0f,  5.0f, // row 1
+             6.0f,  7.0f,  8.0f,  9.0f, 10.0f, // row 2
+            11.0f, 12.0f, 13.0f, 14.0f, 15.0f, // row 3
+            16.0f, 17.0f, 18.0f, 19.0f, 20.0f  // row 4
+        };
+        ggml_tallocr_alloc(&tallocr, t);
+
+        // load data to buffer
+        if(ggml_backend_is_cpu(backend)) {
+            memcpy(t->data, input_data, ggml_nbytes(t));
+        } else {
+            ggml_backend_tensor_set(t, input_data, 0, ggml_nbytes(t));
+        }
+
+        // Test case 4: pad left=3, right=2 on a 2D tensor
+        // Each row should be padded independently
+        float expected_4[] = {
+             4.0f,  3.0f,  2.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  4.0f,  3.0f, // row 1
+             9.0f,  8.0f,  7.0f,  6.0f,  7.0f,  8.0f,  9.0f, 10.0f,  9.0f,  8.0f, // row 2
+            14.0f, 13.0f, 12.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 14.0f, 13.0f, // row 3
+            19.0f, 18.0f, 17.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 19.0f, 18.0f  // row 4
+        };
+        struct ggml_tensor * out_4 = ggml_pad_reflect_1d(ctx, t, 3, 2);
+
+        struct ggml_cgraph * gf = ggml_new_graph(ctx);
+        ggml_build_forward_expand(gf, out_4);
+
+        ggml_gallocr_alloc_graph(gallocr, gf);
+
+        ggml_backend_graph_compute(backend, gf);
+
+        check_tensor(out_4, expected_4, 10, 4, 1);
+
+        ggml_free(ctx);
+        ggml_gallocr_free(gallocr);
+        ggml_backend_buffer_free(buffer);
+    }
+
+    ggml_backend_free(backend);
+}
+
+int main(int argc, const char * argv[]) {
+    bool use_gpu = false;
+    if (argc > 1) {
+        use_gpu = strcmp(argv[1], "--gpu") == 0;
+    }
+    test_pad_reflect_1d(use_gpu);
+    return 0;
+}
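
Not part of the patch: the reflection index math in ggml_compute_forward_pad_reflect_1d and kernel_pad_reflect_1d_f32 is easier to follow against a scalar reference. The sketch below pads a single row of n floats, assuming 0 <= p0, p1 < n as asserted by ggml_pad_reflect_1d; the helper name pad_reflect_1d_row is illustrative only and does not exist in ggml.

// Illustrative scalar reference for reflect-padding one row (not part of the patch).
// dst must have room for p0 + n + p1 floats.
static void pad_reflect_1d_row(const float * src, float * dst, int n, int p0, int p1) {
    for (int i = 0; i < p0; i++) {
        dst[i] = src[p0 - i];             // left pad: reflect around src[0], excluding src[0] itself
    }
    for (int i = 0; i < n; i++) {
        dst[p0 + i] = src[i];             // copy the original row
    }
    for (int i = 0; i < p1; i++) {
        dst[p0 + n + i] = src[n - 2 - i]; // right pad: reflect around src[n - 1]
    }
}

For src = {1, 2, 3, 4} with p0 = 1, p1 = 1 this yields {2, 1, 2, 3, 4, 3}, matching expected_1 in tests/test-pad-reflect-1d.cpp. The right-pad index n - 2 - i is the same value the Metal kernel computes with (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1, and the left-pad index matches the kernel's src0_ptr[p0 - i0] branch.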