Here are some resources about Local Attention
Examples of Local Attention Patterns
The traditional attention mechanism is global and full: every token attends to every other token, which incurs quadratic time and space complexity in the sequence length.
Because local context is what matters in many applications, a variety of local attention mechanisms have been proposed in recent years. They restrict each token's attention to its neighboring tokens rather than to all tokens, and the approaches differ mainly in the heuristic used to decide which tokens qualify as a token's neighbors, as depicted in the picture above and sketched in the code below.
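To make the idea concrete, here is a minimal PyTorch sketch of one common local pattern, sliding-window attention, where each token attends only to tokens within a fixed distance of itself. The `window` size, tensor shapes, and function name are illustrative assumptions rather than the construction of any particular paper listed below; for clarity the sketch still materializes the full score matrix, whereas efficient implementations compute only the banded part.

```python
# Minimal sliding-window local attention sketch (illustrative, not from any
# specific paper below). Each query i may only attend to keys j with
# |i - j| <= window; all other positions are masked out before the softmax.
import torch
import torch.nn.functional as F

def local_attention(q, k, v, window=4):
    """q, k, v: (batch, seq_len, dim). Returns (batch, seq_len, dim)."""
    b, n, d = q.shape
    scores = q @ k.transpose(-2, -1) / d ** 0.5           # (b, n, n) full scores
    idx = torch.arange(n)
    local_mask = (idx[None, :] - idx[:, None]).abs() <= window  # (n, n) band
    scores = scores.masked_fill(~local_mask, float("-inf"))
    return F.softmax(scores, dim=-1) @ v

# Toy usage: one sequence of 16 tokens with 8-dimensional heads.
q = k = v = torch.randn(1, 16, 8)
out = local_attention(q, k, v, window=4)
print(out.shape)  # torch.Size([1, 16, 8])
```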
paper link: here
citation:
@article{chen2023longlora,
title={LongLoRA: Efficient fine-tuning of long-context large language models},
author={Chen, Yukang and Qian, Shengju and Tang, Haotian and Lai, Xin and Liu, Zhijian and Han, Song and Jia, Jiaya},
journal={arXiv preprint arXiv:2309.12307},
year={2023}
}
paper link: here
citation:
@article{mohtashami2023landmark,
title={Landmark Attention: Random-Access Infinite Context Length for Transformers},
author={Mohtashami, Amirkeivan and Jaggi, Martin},
journal={arXiv preprint arXiv:2305.16300},
year={2023}
}
paper link: here
citation:
@article{zuo2022efficient,
title={Efficient long sequence modeling via state space augmented transformer},
author={Zuo, Simiao and Liu, Xiaodong and Jiao, Jian and Charles, Denis and Manavoglu, Eren and Zhao, Tuo and Gao, Jianfeng},
journal={arXiv preprint arXiv:2212.08136},
year={2022}
}
paper link: here
citation:
@article{ma2023mega,
title={Mega: Moving Average Equipped Gated Attention},
author={Ma, Xuezhe and Zhou, Chunting and Kong, Xiang and He, Junxian and Gui, Liangke and Neubig, Graham and May, Jonathan and Zettlemoyer, Luke},
journal={arXiv preprint arXiv:2209.10655},
year={2023}
}
paper link: here
citation:
@inproceedings{tay2020sparse,
title={Sparse Sinkhorn attention},
author={Tay, Yi and Bahri, Dara and Yang, Liu and Metzler, Donald and Juan, Da-Cheng},
booktitle={International Conference on Machine Learning},
pages={9438--9447},
year={2020},
organization={PMLR}
}
paper link: here
citation:
@article{qiu2019blockwise,
title={Blockwise self-attention for long document understanding},
author={Qiu, Jiezhong and Ma, Hao and Levy, Omer and Yih, Scott Wen-tau and Wang, Sinong and Tang, Jie},
journal={arXiv preprint arXiv:1911.02972},
year={2019}
}
paper link: here
citation:
@article{shen2018bi,
title={Bi-directional block self-attention for fast and memory-efficient sequence modeling},
author={Shen, Tao and Zhou, Tianyi and Long, Guodong and Jiang, Jing and Zhang, Chengqi},
journal={arXiv preprint arXiv:1804.00857},
year={2018}
}
paper link: here
citation:
@article{baykal2024alternating,
title={Alternating updates for efficient transformers},
author={Baykal, Cenk and Cutler, Dylan and Dikkala, Nishanth and Ghosh, Nikhil and Panigrahy, Rina and Wang, Xin},
journal={Advances in Neural Information Processing Systems},
volume={36},
year={2024}
}
paper link: here
citation:
@article{dai2020funnel,
title={Funnel-Transformer: Filtering out sequential redundancy for efficient language processing},
author={Dai, Zihang and Lai, Guokun and Yang, Yiming and Le, Quoc},
journal={Advances in neural information processing systems},
volume={33},
pages={4271--4282},
year={2020}
}
paper link: here
citation:
@article{beltagy2020longformer,
title={Longformer: The long-document transformer},
author={Beltagy, Iz and Peters, Matthew E and Cohan, Arman},
journal={arXiv preprint arXiv:2004.05150},
year={2020}
}
paper link: here
citation:
@article{jin2024llm,
title={LLM Maybe LongLM: Self-Extend LLM Context Window Without Tuning},
author={Jin, Hongye and Han, Xiaotian and Yang, Jingfeng and Jiang, Zhimeng and Liu, Zirui and Chang, Chia-Yuan and Chen, Huiyuan and Hu, Xia},
journal={arXiv preprint arXiv:2401.01325},
year={2024}
}
paper link: here
citation:
@article{han2023lm,
title={LM-Infinite: Simple on-the-fly length generalization for large language models},
author={Han, Chi and Wang, Qifan and Xiong, Wenhan and Chen, Yu and Ji, Heng and Wang, Sinong},
journal={arXiv preprint arXiv:2308.16137},
year={2023}
}
paper link: here
citation:
@article{xiao2023efficient,
title={Efficient streaming language models with attention sinks},
author={Xiao, Guangxuan and Tian, Yuandong and Chen, Beidi and Han, Song and Lewis, Mike},
journal={arXiv preprint arXiv:2309.17453},
year={2023}
}
paper link: here
citation:
@article{guo2021longt5,
title={LongT5: Efficient text-to-text transformer for long sequences},
author={Guo, Mandy and Ainslie, Joshua and Uthus, David and Ontanon, Santiago and Ni, Jianmo and Sung, Yun-Hsuan and Yang, Yinfei},
journal={arXiv preprint arXiv:2112.07916},
year={2021}
}
paper link: here
citation:
@article{ainslie2020etc,
title={ETC: Encoding long and structured inputs in transformers},
author={Ainslie, Joshua and Ontanon, Santiago and Alberti, Chris and Cvicek, Vaclav and Fisher, Zachary and Pham, Philip and Ravula, Anirudh and Sanghai, Sumit and Wang, Qifan and Yang, Li},
journal={arXiv preprint arXiv:2004.08483},
year={2020}
}
paper link: here
citation:
@article{kitaev2020reformer,
title={Reformer: The efficient transformer},
author={Kitaev, Nikita and Kaiser, {\L}ukasz and Levskaya, Anselm},
journal={arXiv preprint arXiv:2001.04451},
year={2020}
}