ggml : implement op fusion, starting with REGLU/GEGLU/SWIGLU #14158
base: master
@@ -519,6 +519,8 @@ extern "C" {
        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
        GGML_OP_OPT_STEP_ADAMW,

        GGML_OP_GLU,

        GGML_OP_COUNT,
    };

@@ -542,6 +544,14 @@ extern "C" {
        GGML_UNARY_OP_COUNT,
    };

    enum ggml_glu_op {
        GGML_GLU_OP_REGLU,
        GGML_GLU_OP_GEGLU,
        GGML_GLU_OP_SWIGLU,

        GGML_GLU_OP_COUNT,
    };

    enum ggml_object_type {
        GGML_OBJECT_TYPE_TENSOR,
        GGML_OBJECT_TYPE_GRAPH,
@@ -657,6 +667,7 @@ extern "C" {
    GGML_API const char * ggml_op_symbol(enum ggml_op op);

    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
    GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op);
    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name

    GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

@@ -758,6 +769,7 @@ extern "C" {
    GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);

    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
    GGML_API enum ggml_glu_op   ggml_get_glu_op  (const struct ggml_tensor * tensor);

    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

@@ -1086,6 +1098,63 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor * a);

    // gated linear unit ops
    // A: n columns, r rows,
    // result is n / 2 columns, r rows,
    // expects gate in second half of row, unless swapped is true
    GGML_API struct ggml_tensor * ggml_glu(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            enum ggml_glu_op op,
            bool swapped);

    GGML_API struct ggml_tensor * ggml_reglu(
            struct ggml_context * ctx,
            struct ggml_tensor * a);

    GGML_API struct ggml_tensor * ggml_reglu_swapped(
            struct ggml_context * ctx,
            struct ggml_tensor * a);

    GGML_API struct ggml_tensor * ggml_geglu(
            struct ggml_context * ctx,
            struct ggml_tensor * a);

    GGML_API struct ggml_tensor * ggml_geglu_swapped(
            struct ggml_context * ctx,
            struct ggml_tensor * a);
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just want to note that I have been observing one variants of swiglu. it's used by ultravox, which sigmoid the second half of the vector instead of the first half There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, interesting, worth adding a parameter for, or best just handling in conversion? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would be nice to have a param since the GGUFs are already on the internet. Haven't thought about permuting the FFN up tensor before, nice suggestion There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added swapped variants. @ggerganov I didn't dare update metal code, so needs to be implemented there too. :) |
||
struct ggml_context * ctx, | ||
struct ggml_tensor * a); | ||
    GGML_API struct ggml_tensor * ggml_swiglu_swapped(
            struct ggml_context * ctx,
            struct ggml_tensor * a);

    // A: n columns, r rows,
    // B: n columns, r rows,
    GGML_API struct ggml_tensor * ggml_glu_split(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b,
            enum ggml_glu_op op);

    GGML_API struct ggml_tensor * ggml_reglu_split(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b);

    GGML_API struct ggml_tensor * ggml_geglu_split(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b);

    GGML_API struct ggml_tensor * ggml_swiglu_split(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b);

    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
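
To make the intended usage concrete, here is a rough sketch (not taken from this PR) of how a SwiGLU feed-forward block could be built with the fused and split variants. The weight names, shape comments, and the split-op operand order are assumptions for illustration; ggml_mul_mat, ggml_swiglu and ggml_swiglu_split are the existing/new ggml API calls shown in the diff above.

#include "ggml.h"

// Sketch only: SwiGLU FFN block using the fused op.
// `ffn_up_gate` is a hypothetical projection whose rows hold the value and gate halves;
// per the header comment, ggml_swiglu() returns a tensor with half the columns.
static struct ggml_tensor * ffn_swiglu_fused(
        struct ggml_context * ctx,
        struct ggml_tensor * cur,          // [n_embd, n_tokens]
        struct ggml_tensor * ffn_up_gate,  // projects to [2*n_ff, n_tokens]
        struct ggml_tensor * ffn_down) {
    struct ggml_tensor * up  = ggml_mul_mat(ctx, ffn_up_gate, cur); // [2*n_ff, n_tokens]
    struct ggml_tensor * act = ggml_swiglu(ctx, up);                // [n_ff, n_tokens]
    // a model that stores the two halves the other way around (the ultravox case above)
    // would call ggml_swiglu_swapped(ctx, up) instead of permuting weights at conversion
    return ggml_mul_mat(ctx, ffn_down, act);                        // [n_embd, n_tokens]
}

// Same block for models that keep separate gate/up tensors, using the split op;
// which operand is activated vs. used as the multiplier is an assumption here.
static struct ggml_tensor * ffn_swiglu_split(
        struct ggml_context * ctx,
        struct ggml_tensor * cur,
        struct ggml_tensor * ffn_gate,
        struct ggml_tensor * ffn_up,
        struct ggml_tensor * ffn_down) {
    struct ggml_tensor * gate = ggml_mul_mat(ctx, ffn_gate, cur);   // [n_ff, n_tokens]
    struct ggml_tensor * up   = ggml_mul_mat(ctx, ffn_up,   cur);   // [n_ff, n_tokens]
    struct ggml_tensor * act  = ggml_swiglu_split(ctx, gate, up);   // [n_ff, n_tokens]
    return ggml_mul_mat(ctx, ffn_down, act);
}
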
Tbh I don't even know why geglu was added in the first place. It doesn't seem to be used by any models. And to make matters worse, the PR where it was added has no useful description: #14074
So I wonder if we actually need to implement it as a kernel. The current kernel uses a tanh approximation, but in practice there can be many different approximations of the gelu op.
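For reference (standard definitions, not text from this PR), the exact GELU and the tanh approximation in question are:

$$\mathrm{GELU}(x) = x\,\Phi(x) = \tfrac{x}{2}\left(1 + \mathrm{erf}\!\left(\tfrac{x}{\sqrt{2}}\right)\right)$$

$$\mathrm{GELU}_{\tanh}(x) \approx \tfrac{x}{2}\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right)$$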
Nvm, see: https://github.com/ggml-org/llama.cpp/pull/14014/files#r2146203459
I've seen several, and in fact we already support a few (Gemma, DeepSeekV1, Jina-Bert and T5); it's just that the gate is split (for some of them at conversion time, because we didn't have the op).
It's pretty easy to add different GLU ops (in CUDA I even reuse the original op), so adding GEGLU_ERF, if necessary, shouldn't be a problem.