Here are some resources about Miscellaneous Architectures for language/sequence modeling
tag: TokenFormer
| Pattention
| Google
| Peking University
paper link: here
github link: here
citation:
@misc{wang2024tokenformerrethinkingtransformerscaling,
title={TokenFormer: Rethinking Transformer Scaling with Tokenized Model Parameters},
author={Haiyang Wang and Yue Fan and Muhammad Ferjad Naeem and Yongqin Xian and Jan Eric Lenssen and Liwei Wang and Federico Tombari and Bernt Schiele},
year={2024},
eprint={2410.23168},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2410.23168},
}
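The core idea is Pattention: every linear projection is re-cast as attention over a set of learnable parameter tokens, so the model scales by appending tokens rather than reshaping weight matrices. A minimal PyTorch sketch, assuming plain softmax scoring (the paper actually uses a GeLU-based normalization); all names are illustrative:

```python
import torch
import torch.nn as nn

class Pattention(nn.Module):
    """A linear layer re-cast as attention over learnable parameter tokens."""
    def __init__(self, d_in, d_out, num_param_tokens):
        super().__init__()
        self.key_params = nn.Parameter(torch.randn(num_param_tokens, d_in))
        self.value_params = nn.Parameter(torch.randn(num_param_tokens, d_out))

    def forward(self, x):                        # x: (..., d_in)
        scores = x @ self.key_params.t()         # (..., num_param_tokens)
        weights = torch.softmax(scores, dim=-1)  # paper uses a GeLU-based norm
        return weights @ self.value_params       # (..., d_out)
```

Scaling the model then amounts to concatenating new rows to `key_params`/`value_params` while keeping the trained tokens intact.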
tag: LoLCATs
| Attention Transfer
| LoRA
| Together AI
| Stanford University
paper link: here
github link: here
citation:
@misc{zhang2024lolcatslowranklinearizinglarge,
title={LoLCATs: On Low-Rank Linearizing of Large Language Models},
author={Michael Zhang and Simran Arora and Rahul Chalamala and Alan Wu and Benjamin Spector and Aaryan Singhal and Krithik Ramesh and Christopher Ré},
year={2024},
eprint={2410.10254},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2410.10254},
}
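LoLCATs linearizes a pretrained LLM in two steps: first train feature maps so linear attention mimics each softmax attention head ("attention transfer"), then recover quality with LoRA. A rough sketch of the transfer objective; the feature map `phi` and the MSE target are my reading of the setup:

```python
import torch
import torch.nn.functional as F

def softmax_attention(q, k, v):
    scores = (q @ k.transpose(-1, -2)) / q.shape[-1] ** 0.5
    return torch.softmax(scores, dim=-1) @ v

def linear_attention(q, k, v, phi):
    q, k = phi(q), phi(k)                    # learned feature map
    kv = k.transpose(-1, -2) @ v             # (d, d_v): no T x T matrix formed
    z = q @ k.sum(dim=-2, keepdim=True).transpose(-1, -2)
    return (q @ kv) / (z + 1e-6)

def transfer_loss(q, k, v, phi):
    """Fit phi so linear attention matches the frozen softmax teacher."""
    with torch.no_grad():
        target = softmax_attention(q, k, v)
    return F.mse_loss(linear_attention(q, k, v, phi), target)

# a simple stand-in feature map (the paper learns this per head)
phi = lambda t: F.elu(t) + 1
```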
tag: XNet
| KAN
paper link: here
citation:
@misc{li2024modelcomparisonsxnetoutperforms,
title={Model Comparisons: XNet Outperforms KAN},
author={Xin Li and Zhihong Jeff Xia and Xiaotao Zheng},
year={2024},
eprint={2410.02033},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2410.02033},
}
tag: L-Mul
| BitEnergy AI
paper link: here
citation:
@misc{luo2024additionneedenergyefficientlanguage,
title={Addition is All You Need for Energy-efficient Language Models},
author={Hongyin Luo and Wei Sun},
year={2024},
eprint={2410.00907},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.00907},
}
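L-Mul approximates floating-point multiplication with additions: mantissas and exponents are added instead of multiplied, plus a small correction term. A toy scalar sketch using `math.frexp`; the paper operates on fp8 bit fields in hardware, and the fixed `mantissa_bits` offset is a simplification of its l(m) rule:

```python
import math

def l_mul(x, y, mantissa_bits=3):
    """Approximate x * y using only additions on exponents and mantissas."""
    if x == 0.0 or y == 0.0:
        return 0.0
    # decompose |x| = (1 + mx) * 2**ex with mx in [0, 1)
    mx, ex = math.frexp(abs(x))  # frexp yields m in [0.5, 1), so rescale
    my, ey = math.frexp(abs(y))
    mx, ex = 2 * mx - 1, ex - 1
    my, ey = 2 * my - 1, ey - 1
    # replace (1 + mx) * (1 + my) with 1 + mx + my + 2**-l(m): addition only
    offset = 2.0 ** -mantissa_bits  # simplified stand-in for the paper's l(m)
    sign = math.copysign(1.0, x) * math.copysign(1.0, y)
    return sign * (1 + mx + my + offset) * 2.0 ** (ex + ey)

# example: l_mul(3.0, 5.0) -> 15.0 (exact product is 15)
```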
tag: XNet
| Cauchy Activation Function
| Cauchy Integral Theorem
paper link: here
follow-up work: here
citation:
@misc{li2024cauchyactivationfunctionxnet,
title={Cauchy activation function and XNet},
author={Xin Li and Zhihong Xia and Hongkun Zhang},
year={2024},
eprint={2409.19221},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2409.19221},
}
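The activation is a rational function motivated by the Cauchy integral formula. A sketch under my reading of the paper; the exact parameterization (trainable lambda1, lambda2, d) is an assumption:

```python
import torch
import torch.nn as nn

class CauchyActivation(nn.Module):
    """f(x) = lambda1 * x / (x^2 + d^2) + lambda2 / (x^2 + d^2), trainable."""
    def __init__(self):
        super().__init__()
        self.lambda1 = nn.Parameter(torch.tensor(1.0))
        self.lambda2 = nn.Parameter(torch.tensor(1.0))
        self.d = nn.Parameter(torch.tensor(1.0))

    def forward(self, x):
        denom = x * x + self.d * self.d
        return self.lambda1 * x / denom + self.lambda2 / denom
```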
tag: Matmul-free LM
| UCSC
paper link: here
github link: here
citation:
@misc{zhu2024scalablematmulfreelanguagemodeling,
title={Scalable MatMul-free Language Modeling},
author={Rui-Jie Zhu and Yu Zhang and Ethan Sifferman and Tyler Sheaves and Yiqiao Wang and Dustin Richmond and Peng Zhou and Jason K. Eshraghian},
year={2024},
eprint={2406.02528},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2406.02528},
}
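Dense layers become multiplication-free by constraining weights to {-1, 0, +1}, so a matmul degenerates into additions and subtractions. A sketch using BitNet-style absmean quantization (training would additionally need a straight-through estimator, omitted here):

```python
import torch

def ternarize(w, eps=1e-5):
    """Absmean quantization: scale, then round weights into {-1, 0, +1}."""
    scale = w.abs().mean().clamp(min=eps)
    return (w / scale).round().clamp(-1, 1), scale

def matmul_free_linear(x, w):
    w_t, scale = ternarize(w)
    # with entries in {-1, 0, +1}, x @ w_t.t() needs only adds/subtracts
    return (x @ w_t.t()) * scale
```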
tag: YOCO
| Microsoft
| Tsinghua University
paper link: here
github link: here
citation:
@misc{sun2024cacheoncedecoderdecoderarchitectures,
title={You Only Cache Once: Decoder-Decoder Architectures for Language Models},
author={Yutao Sun and Li Dong and Yi Zhu and Shaohan Huang and Wenhui Wang and Shuming Ma and Quanlu Zhang and Jianyong Wang and Furu Wei},
year={2024},
eprint={2405.05254},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.05254},
}
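YOCO splits the stack into a self-decoder that builds the context and a cross-decoder whose layers all attend to a single, globally shared KV cache, stored exactly once. A structural sketch; the placeholder self-decoder block stands in for the paper's efficient attention (e.g. sliding-window attention or gated retention):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossBlock(nn.Module):
    """Cross-decoder layer: attends to the shared global K/V."""
    def __init__(self, d):
        super().__init__()
        self.q = nn.Linear(d, d)
        self.o = nn.Linear(d, d)

    def forward(self, x, k, v):
        attn = F.scaled_dot_product_attention(self.q(x), k, v, is_causal=True)
        return x + self.o(attn)

class YOCOSketch(nn.Module):
    def __init__(self, d, n_self, n_cross):
        super().__init__()
        # stand-in for the paper's efficient self-attention layers
        self.self_layers = nn.ModuleList(nn.Linear(d, d) for _ in range(n_self))
        self.to_kv = nn.Linear(d, 2 * d)  # global K/V, computed exactly once
        self.cross_layers = nn.ModuleList(CrossBlock(d) for _ in range(n_cross))

    def forward(self, x):
        for layer in self.self_layers:
            x = x + torch.tanh(layer(x))        # placeholder self-decoder block
        k, v = self.to_kv(x).chunk(2, dim=-1)   # the only KV cache in the model
        for layer in self.cross_layers:
            x = layer(x, k, v)                  # every layer reuses the cache
        return x
```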
tag: M2
| Monarch Mixer
| Stanford University
paper link: here
citation:
@article{fu2023monarch,
title={Monarch Mixer: A simple sub-quadratic GEMM-based architecture},
author={Fu, Daniel Y and Arora, Simran and Grogan, Jessica and Johnson, Isys and Eyuboglu, Sabri and Thomas, Armin W and Spector, Benjamin and Poli, Michael and Rudra, Atri and R{\'e}, Christopher},
journal={arXiv preprint arXiv:2310.12109},
year={2023}
}
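A Monarch matrix factors a dense n-by-n matmul into two block-diagonal matmuls around a reshape-transpose permutation, costing O(n^1.5) for n = m^2. A sketch of the multiply under one common permutation convention (exact conventions vary):

```python
import torch

def monarch_matmul(x, blocks_r, blocks_l):
    """x: (batch, n) with n = m * m; blocks_*: (m, m, m) block-diagonal factors."""
    b, n = x.shape
    m = blocks_r.shape[0]
    x = x.view(b, m, m)
    x = torch.einsum('kij,bkj->bki', blocks_r, x)  # first block-diagonal factor
    x = x.transpose(-1, -2)                        # permutation between factors
    x = torch.einsum('kij,bkj->bki', blocks_l, x)  # second block-diagonal factor
    return x.reshape(b, n)

# usage: n = 16 with m = 4 blocks of size 4x4
y = monarch_matmul(torch.randn(2, 16), torch.randn(4, 4, 4), torch.randn(4, 4, 4))
```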
tag: MogaNet
| ICLR24
| Zhejiang University
paper link: here
github link: here
citation:
@misc{li2024moganetmultiordergatedaggregation,
title={MogaNet: Multi-order Gated Aggregation Network},
author={Siyuan Li and Zedong Wang and Zicheng Liu and Cheng Tan and Haitao Lin and Di Wu and Zhiyuan Chen and Jiangbin Zheng and Stan Z. Li},
year={2024},
eprint={2211.03295},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2211.03295},
}
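MogaNet's block gates a multi-order context branch (parallel depthwise convolutions at different receptive fields) with a SiLU-activated branch. A loose sketch; channel splits, dilation rates, and normalization in the paper differ in detail:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedAggregation(nn.Module):
    def __init__(self, c):
        super().__init__()
        self.gate = nn.Conv2d(c, c, 1)
        self.value = nn.Conv2d(c, c, 1)
        # "multi-order" context: depthwise convs at two receptive fields
        self.dw_small = nn.Conv2d(c, c, 5, padding=2, groups=c)
        self.dw_large = nn.Conv2d(c, c, 7, padding=9, dilation=3, groups=c)
        self.proj = nn.Conv2d(c, c, 1)

    def forward(self, x):
        v = self.value(x)
        context = self.dw_small(v) + self.dw_large(v)
        return self.proj(F.silu(self.gate(x)) * context)  # gate * context
```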
tag: AFT
| Apple
paper link: here
citation:
@misc{zhai2021attentionfreetransformer,
title={An Attention Free Transformer},
author={Shuangfei Zhai and Walter Talbott and Nitish Srivastava and Chen Huang and Hanlin Goh and Ruixiang Zhang and Josh Susskind},
year={2021},
eprint={2105.14103},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2105.14103},
}
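AFT drops query-key dot products entirely: each output is a sigmoid-gated, position-biased weighted average of values, Y_t = sigmoid(Q_t) * sum_{t'} exp(K_{t'} + w_{t,t'}) V_{t'} / sum_{t'} exp(K_{t'} + w_{t,t'}). A direct (memory-hungry, numerically unstabilized) sketch of AFT-full:

```python
import torch

def aft_full(q, k, v, w):
    """q, k, v: (batch, T, d); w: (T, T) learned pairwise position biases."""
    # exp(K_{t'} + w_{t,t'}) via broadcasting: (batch, T, T, d)
    weights = torch.exp(w)[None, :, :, None] * torch.exp(k)[:, None, :, :]
    num = (weights * v[:, None, :, :]).sum(dim=2)  # weighted sum over t'
    den = weights.sum(dim=2)
    return torch.sigmoid(q) * num / den  # sigmoid(Q) gates the average

# usage
q = k = v = torch.randn(2, 8, 16)
y = aft_full(q, k, v, torch.zeros(8, 8))  # (2, 8, 16)
```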
tag: RevNet
| NIPS17
paper link: here
github link: here
citation:
@article{gomez2017reversible,
title={The reversible residual network: Backpropagation without storing activations},
author={Gomez, Aidan N and Ren, Mengye and Urtasun, Raquel and Grosse, Roger B},
journal={Advances in neural information processing systems},
volume={30},
year={2017}
}
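Reversible blocks let backprop recompute activations instead of storing them: with y1 = x1 + F(x2) and y2 = x2 + G(y1), the inputs are recovered exactly from the outputs. A minimal sketch:

```python
import torch

def rev_forward(x1, x2, f, g):
    y1 = x1 + f(x2)
    y2 = x2 + g(y1)
    return y1, y2

def rev_inverse(y1, y2, f, g):
    x2 = y2 - g(y1)  # undo the second residual
    x1 = y1 - f(x2)  # then the first
    return x1, x2

# usage: any shape-preserving functions f, g work
f, g = torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)
x1, x2 = torch.randn(4, 8), torch.randn(4, 8)
y1, y2 = rev_forward(x1, x2, f, g)
r1, r2 = rev_inverse(y1, y2, f, g)
assert torch.allclose(r1, x1, atol=1e-6) and torch.allclose(r2, x2, atol=1e-6)
```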