Vulkan: Implement glu_split logic and shader support

0cc4m · 0cc4m · commit 77be71e679e9 · 2025-06-15T06:12:54.000Z
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -659,6 +659,11 @@ struct vk_op_push_constants {
     float param2;
 };
 
+struct vk_op_glu_push_constants {
+    uint32_t ne00;
+    uint32_t mode;  // 0: default, 1: swapped, 2: split
+};
+
 struct vk_op_unary_push_constants {
     uint32_t ne;
     uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
@@ -2733,8 +2738,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #undef CREATE_UNARY
 
 #define CREATE_GLU(name)  \
-    ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);  \
-    ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);  \
+    ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     CREATE_GLU(geglu)
     CREATE_GLU(reglu)
@@ -6947,7 +6952,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         }
     }
 
-    if (op == GGML_OP_SOFT_MAX) {
+    if (op == GGML_OP_SOFT_MAX || op == GGML_OP_GLU) {
         // Empty src1 is possible in soft_max, but the shader needs a buffer
         vk_subbuffer subbuf_y;
         if (use_src1) {
@@ -7539,12 +7544,23 @@ static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, con
     ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
 }
 
-static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]);
+static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    const bool swapped = (bool)dst->op_params[1];
+    const bool split = src1 != nullptr;
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    if (!split) {
+        GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]);
+    } else {
+        GGML_ASSERT(src0->ne[0] == src1->ne[0]);
+        GGML_ASSERT(src0->ne[0] == dst->ne[0]);
+        GGML_ASSERT(src0->type == src1->type);
+    }
 
-    const uint32_t swapped = (uint32_t)dst->op_params[1];
+    const uint32_t mode = split ? 2 : (swapped ? 1 : 0);
 
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GLU, { (uint32_t)src0->ne[0], swapped, 0.0f, 0.0f }, dryrun);
+    ggml_vk_op_f32<vk_op_glu_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GLU, { (uint32_t)src0->ne[0], mode }, dryrun);
 }
 
 static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
@@ -9003,7 +9019,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_GLU_OP_GEGLU:
         case GGML_GLU_OP_REGLU:
         case GGML_GLU_OP_SWIGLU:
-            ggml_vk_glu(ctx, compute_ctx, src0, node, dryrun);
+            ggml_vk_glu(ctx, compute_ctx, src0, src1, node, dryrun);
             break;
         default:
             return false;
@@ -10725,7 +10741,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
             GGML_ABORT("fatal error");
         }
     } else if (tensor->op == GGML_OP_GLU) {
-        tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]);
+        if (src_clone[1] == nullptr) {
+            tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]);
+        } else {
+            tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]);
+        }
     } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
         if (src1 == nullptr) {
             tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp
@@ -1,43 +1,13 @@
 #version 450
 
-#include "generic_head.comp"
-#include "types.comp"
+#include "glu_head.comp"
 
-#extension GL_EXT_control_flow_attributes : enable
+const float GELU_COEF_A    = 0.044715f;
+const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
 
-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-layout (constant_id = 0) const uint BLOCK_SIZE = 32;
-
-void main() {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-
-    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
-    const uint col = gl_LocalInvocationID.x;
-
-    const uint offset = p.KX / 2;
-
-    const bool swapped = p.KY > 0;
-
-    if (!swapped) {
-        for (uint i = col; i < offset; i += BLOCK_SIZE) {
-            const uint idx = row * p.KX + i;
-
-            const float xi = float(data_a[idx]);
-            const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
-            data_d[row * offset + i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)) * float(data_a[idx + offset]));
-        }
-    } else {
-        for (uint i = col; i < offset; i += BLOCK_SIZE) {
-            const uint idx = row * p.KX + i;
-
-            const float xi = float(data_a[idx + offset]);
-            const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
-            data_d[row * offset + i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)) * float(data_a[idx]));
-        }
-    }
+float op(float a, float b) {
+    const float val = SQRT_2_OVER_PI*a*(1.0f + GELU_COEF_A*a*a);
+    return 0.5f*a*(2.0f - 2.0f / (exp(2 * val) + 1)) * b;
 }
+
+#include "glu_main.comp"
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp
@@ -0,0 +1,15 @@
+#extension GL_EXT_shader_16bit_storage : require
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer B {A_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+
+layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+
+layout (push_constant) uniform parameter
+{
+    uint ne00;
+    uint mode;
+} p;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp
@@ -0,0 +1,31 @@
+void main() {
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint col = gl_LocalInvocationID.x;
+
+    if (p.mode == 0) {
+        // Default
+        const uint offset = p.ne00 / 2;
+
+        for (uint i = col; i < offset; i += BLOCK_SIZE) {
+            const uint idx = row * p.ne00 + i;
+
+            data_d[row * offset + i] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset])));
+        }
+    } else if (p.mode == 1) {
+        // Swapped
+        const uint offset = p.ne00 / 2;
+
+        for (uint i = col; i < offset; i += BLOCK_SIZE) {
+            const uint idx = row * p.ne00 + i;
+
+            data_d[row * offset + i] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx])));
+        }
+    } else {
+        // Split
+        for (uint i = col; i < p.ne00; i += BLOCK_SIZE) {
+            const uint idx = row * p.ne00 + i;
+
+            data_d[idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx])));
+        }
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp
@@ -1,36 +1,9 @@
 #version 450
 
-#include "generic_head.comp"
-#include "types.comp"
+#include "glu_head.comp"
 
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-layout (constant_id = 0) const uint BLOCK_SIZE = 32;
-
-void main() {
-    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
-    const uint col = gl_LocalInvocationID.x;
-
-    const uint offset = p.KX / 2;
-
-    const bool swapped = p.KY > 0;
-
-    if (!swapped) {
-        for (uint i = col; i < offset; i += BLOCK_SIZE) {
-            const uint idx = row * p.KX + i;
-
-            data_d[row * offset + i] = D_TYPE(max(float(data_a[idx]), 0.0f) * float(data_a[idx + offset]));
-        }
-    } else {
-        for (uint i = col; i < offset; i += BLOCK_SIZE) {
-            const uint idx = row * p.KX + i;
-
-            data_d[row * offset + i] = D_TYPE(max(float(data_a[idx + offset]), 0.0f) * float(data_a[idx]));
-        }
-    }
+float op(float a, float b) {
+    return max(a, 0.0f) * b;
 }
+
+#include "glu_main.comp"
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp
@@ -1,38 +1,9 @@
 #version 450
 
-#include "generic_head.comp"
-#include "types.comp"
+#include "glu_head.comp"
 
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-layout (constant_id = 0) const uint BLOCK_SIZE = 32;
-
-void main() {
-    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
-    const uint col = gl_LocalInvocationID.x;
-
-    const uint offset = p.KX / 2;
-
-    const bool swapped = p.KY > 0;
-
-    if (!swapped) {
-        for (uint i = col; i < offset; i += BLOCK_SIZE) {
-            const uint idx = row * p.KX + i;
-
-            const float xi = float(data_a[idx]);
-            data_d[row * offset + i] = D_TYPE(xi / (1.0f + exp(-xi)) * float(data_a[idx + offset]));
-        }
-    } else {
-        for (uint i = col; i < offset; i += BLOCK_SIZE) {
-            const uint idx = row * p.KX + i;
-
-            const float xi = float(data_a[idx + offset]);
-            data_d[row * offset + i] = D_TYPE(xi / (1.0f + exp(-xi)) * float(data_a[idx]));
-        }
-    }
+float op(float a, float b) {
+    return a / (1.0f + exp(-a)) * b;
 }
+
+#include "glu_main.comp"