diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index 6390445c8..1fae42e0d 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -74,7 +74,54 @@ class AWQMapping:
     ),
 ]
 
+# Gemma includes a pre_feedforward_layernorm between
+# post_attention_layernorm and the MLP gate/up proj layers;
+# use it instead of post_attention_layernorm in the 3rd mapping:
+_gemma_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*pre_feedforward_layernorm$",
+        ["re:.*gate_proj$", "re:.*up_proj$"],
+    ),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
+
+# The Cohere architecture is similar to the default, with one fundamental
+# difference: the MLP block runs in parallel to attention. The tensor goes
+# through input_layernorm and from there directly into both the attention
+# module and the MLP module.
+_cohere_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        [
+            "re:.*self_attn.q_proj$",
+            "re:.*self_attn.k_proj$",
+            "re:.*self_attn.v_proj$",
+            "re:.*mlp.gate_proj$",
+            "re:.*mlp.up_proj$",
+        ],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
 AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = {
+    "CohereForCausalLM": _cohere_mappings,
+    "Cohere2ForCausalLM": _cohere_mappings,
+    "Gemma2ForCausalLM": _gemma_mappings,
+    "Gemma3ForCausalLM": _gemma_mappings,
+    "Gemma3ForConditionalGeneration": _gemma_mappings,
     "LlamaForCausalLM": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,
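
A minimal usage sketch (illustrative, not part of the patch): AWQModifier
resolves its mappings from AWQ_MAPPING_REGISTRY by the loaded model's
architecture name, so the newly registered Gemma and Cohere models need no
explicit mappings argument. The checkpoint, dataset, and calibration settings
below are assumptions modeled on llm-compressor's published AWQ examples and
may differ by version:

    from llmcompressor import oneshot
    from llmcompressor.modifiers.awq import AWQModifier

    recipe = [
        AWQModifier(
            # Mappings are looked up in AWQ_MAPPING_REGISTRY using the
            # model's architecture string (e.g. "Gemma2ForCausalLM"),
            # falling back to the default mappings if unregistered.
            ignore=["lm_head"],
            scheme="W4A16_ASYM",
            targets=["Linear"],
        )
    ]

    oneshot(
        model="google/gemma-2-2b-it",  # assumed example checkpoint
        dataset="open_platypus",       # assumed calibration dataset
        recipe=recipe,
        max_seq_length=2048,
        num_calibration_samples=256,
    )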