Here are some resources about Position Embedding in Transformers
Effective Long-Context Scaling of Foundation Models
tag: Llama2 Long
paper link: here
citation:
@misc{xiong2023effective,
title={Effective Long-Context Scaling of Foundation Models},
author={Wenhan Xiong and Jingyu Liu and Igor Molybog and Hejia Zhang and Prajjwal Bhargava and Rui Hou and Louis Martin and Rashi Rungta and Karthik Abinav Sankararaman and Barlas Oguz and Madian Khabsa and Han Fang and Yashar Mehdad and Sharan Narang and Kshitiz Malik and Angela Fan and Shruti Bhosale and Sergey Edunov and Mike Lewis and Sinong Wang and Hao Ma},
year={2023},
eprint={2309.16039},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
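The main positional change in this paper is RoPE with an adjusted base frequency (ABF): the rotary base is raised from 10,000 to 500,000 before continual pretraining on long sequences. A minimal sketch of the affected quantity, the per-dimension inverse frequencies (function and variable names are mine, not the paper's):

```python
import torch

def rope_inv_freq(head_dim: int, base: float = 10000.0) -> torch.Tensor:
    """Per-dimension RoPE inverse frequencies theta_i = base^(-2i/d)."""
    return 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))

# Llama 2 uses base 10,000; Llama 2 Long raises it (ABF) to 500,000, which slows
# every rotation and spreads distinguishable relative positions over a longer context.
inv_freq_llama2 = rope_inv_freq(head_dim=128, base=10_000.0)
inv_freq_abf = rope_inv_freq(head_dim=128, base=500_000.0)
```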
CoCA: Fusing Position Embedding with Collinear Constrained Attention in Transformers for Long Context Window Extending
tag: CoCA
paper link: here
github link: here
citation:
@misc{zhu2024cocafusingpositionembedding,
title={CoCA: Fusing Position Embedding with Collinear Constrained Attention in Transformers for Long Context Window Extending},
author={Shiyi Zhu and Jing Ye and Wei Jiang and Siqiao Xue and Qi Zhang and Yifan Wu and Jianguo Li},
year={2024},
eprint={2309.08646},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2309.08646},
}
Transformer Upgrade Roadmap: 12. ReRoPE for Infinite Extrapolation?
tag: ReRoPE
overview:
blog link: here
citation:
@misc{transformer-upgrade-12,
author = "Su, Jianlin",
title = "Transformer Upgrade Roadmap: 12. ReRoPE for Infinite Extrapolation?",
year = "2023",
month = "Aug",
howpublished = "\url{https://spaces.ac.cn/archives/9708}"
}
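ReRoPE keeps exact RoPE relative positions inside a local window w and clamps them beyond it (the "Leaky" variant instead compresses them by a factor k), so no relative position ever exceeds the trained range. A rough sketch of just the position matrix; the blog's actual implementation needs two attention passes merged by a window mask, and w and k are hyperparameters named here only for illustration:

```python
from typing import Optional

import torch

def rerope_positions(seq_len: int, w: int, k: Optional[float] = None) -> torch.Tensor:
    """Relative-position matrix for (Leaky) ReRoPE.

    Inside the window (distance < w) the true relative position is kept;
    beyond it, ReRoPE clamps the distance to w, while Leaky ReRoPE lets it
    grow slowly as w + (distance - w) / k.
    """
    rel = torch.arange(seq_len)[:, None] - torch.arange(seq_len)[None, :]
    rel = rel.clamp(min=0).float()   # causal attention: only non-negative distances
    if k is None:                    # ReRoPE: hard clamp at the window size
        return rel.clamp(max=float(w))
    return torch.where(rel < w, rel, w + (rel - w) / k)
```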
YaRN: Efficient Context Window Extension of Large Language Models
tag: YaRN
| NTK-by-parts
paper link: here
github link: here
citation:
@misc{peng2023yarnefficientcontextwindow,
title={YaRN: Efficient Context Window Extension of Large Language Models},
author={Bowen Peng and Jeffrey Quesnelle and Honglu Fan and Enrico Shippole},
year={2023},
eprint={2309.00071},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2309.00071},
}
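YaRN combines "NTK-by-parts" interpolation with an attention temperature. Each RoPE frequency is rescaled according to how many periods it completes inside the original context: low-frequency dimensions are interpolated by the full scale factor, high-frequency dimensions are left alone, and a linear ramp blends the two. A sketch under my reading of the paper; alpha = 1, beta = 32 and the 0.1 * ln(s) + 1 temperature are the values I recall being reported for LLaMA-family models:

```python
import math

import torch

def yarn_inv_freq(head_dim: int, scale: float, orig_ctx: int, base: float = 10000.0,
                  alpha: float = 1.0, beta: float = 32.0) -> torch.Tensor:
    """NTK-by-parts: per-dimension blend of interpolated and original RoPE frequencies."""
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    r = orig_ctx * inv_freq / (2 * math.pi)   # rotations completed over the original context
    gamma = ((r - alpha) / (beta - alpha)).clamp(0.0, 1.0)   # 0: interpolate fully, 1: keep as is
    return (1 - gamma) * inv_freq / scale + gamma * inv_freq

def yarn_mscale(scale: float) -> float:
    """Attention temperature sqrt(1/t) = 0.1 * ln(s) + 1, applied to the rotated q and k."""
    return 0.1 * math.log(scale) + 1.0
```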
Giraffe: Adventures in Expanding Context Lengths in LLMs
tag: Giraffe
| Power-Scaling
overview:
paper link: here
citation:
@article{pal2023giraffe,
title={Giraffe: Adventures in expanding context lengths in llms},
author={Pal, Arka and Karkhanis, Deep and Roberts, Manley and Dooley, Samuel and Sundararajan, Arvind and Naidu, Siddartha},
journal={arXiv preprint arXiv:2308.10882},
year={2023}
}
NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation
tag: NTK-aware RoPE
| NTK-aware Scaled RoPE
| Dynamic NTK-aware RoPE
| NTK-mixed RoPE
overview:
blog link: NTK-Aware Scaled RoPE | Dynamic NTK-aware RoPE | NTK-mixed RoPE
citation:
@misc{ntk-aware-rope,
author = "bloc97",
title = "NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation",
year = "2023",
month = "Jun",
howpublished = "\url{https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/}"
}
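NTK-aware scaling keeps RoPE's form but enlarges the base so that the highest-frequency dimension still rotates as before while the lowest-frequency dimension is stretched by roughly the full scale factor; the dynamic variant recomputes the scale from the sequence length actually being processed. A sketch of the usual closed form (the dynamic scale below is the simplest choice, not necessarily the exact formula used in any particular library):

```python
def ntk_aware_base(base: float, scale: float, head_dim: int) -> float:
    """NTK-aware RoPE: replace the base b with b * s^(d / (d - 2))."""
    return base * scale ** (head_dim / (head_dim - 2))

def dynamic_ntk_scale(seq_len: int, orig_ctx: int) -> float:
    """Dynamic NTK: derive the scale from the current sequence length, so short
    inputs keep the original base and only longer inputs are stretched."""
    return max(1.0, seq_len / orig_ctx)

# e.g. a model with head_dim=128 trained on 2k tokens, now run at 8k tokens
new_base = ntk_aware_base(10000.0, dynamic_ntk_scale(8192, 2048), 128)
```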
Extending Context Window of Large Language Models via Positional Interpolation
tag: PI
overview:
paper link: here
citation:
@article{chen2023extending,
title={Extending context window of large language models via positional interpolation},
author={Chen, Shouyuan and Wong, Sherman and Chen, Liangjian and Tian, Yuandong},
journal={arXiv preprint arXiv:2306.15595},
year={2023}
}
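Positional Interpolation (PI) linearly rescales position indices so that a longer context is squeezed back into the trained range: position m becomes m / s with s = L_new / L_train, followed by a brief fine-tune. A minimal sketch:

```python
import torch

def interpolated_positions(seq_len: int, orig_ctx: int) -> torch.Tensor:
    """PI: compress positions 0..seq_len-1 linearly back into [0, orig_ctx)."""
    scale = max(1.0, seq_len / orig_ctx)
    return torch.arange(seq_len).float() / scale

pos = interpolated_positions(8192, 2048)   # 8k positions mapped into the trained 2k range
```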
A Frustratingly Easy Improvement for Position Embeddings via Random Padding
tag: Random Padding PE
paper link: here
citation:
@article{tao2023frustratingly,
title={A Frustratingly Easy Improvement for Position Embeddings via Random Padding},
author={Tao, Mingxu and Feng, Yansong and Zhao, Dongyan},
journal={arXiv preprint arXiv:2305.04859},
year={2023}
}
Randomized Positional Encodings Boost Length Generalization of Transformers
tag: Randomized PE
paper link: here
citation:
@article{ruoss2023randomized,
title={Randomized Positional Encodings Boost Length Generalization of Transformers},
author={Ruoss, Anian and Del{\'e}tang, Gr{\'e}goire and Genewein, Tim and Grau-Moya, Jordi and Csord{\'a}s, R{\'o}bert and Bennani, Mehdi and Legg, Shane and Veness, Joel},
journal={arXiv preprint arXiv:2305.16843},
year={2023}
}
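The randomized scheme decouples training positions from token indices: for a length-N sequence it samples N distinct positions from a much larger range [0, L_max), sorts them, and uses those as position indices, so the model already sees large position values during training. A sketch; L_max = 2048 is an assumed setting:

```python
import torch

def randomized_positions(seq_len: int, max_pos: int = 2048) -> torch.Tensor:
    """Sample `seq_len` distinct positions from [0, max_pos) and sort them, so the
    ordering is preserved while the absolute values vary from batch to batch."""
    assert seq_len <= max_pos
    return torch.sort(torch.randperm(max_pos)[:seq_len]).values

positions = randomized_positions(seq_len=40)   # training-time positions for a 40-token input
```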
A Length-Extrapolatable Transformer
tag: LEX
| XPOS
overview:
paper link: here
citation:
@article{sun2022length,
title={A length-extrapolatable transformer},
author={Sun, Yutao and Dong, Li and Patra, Barun and Ma, Shuming and Huang, Shaohan and Benhaim, Alon and Chaudhary, Vishrav and Song, Xia and Wei, Furu},
journal={arXiv preprint arXiv:2212.10554},
year={2022}
}
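xPos (the position encoding of the length-extrapolatable transformer, LEX) augments RoPE with a per-dimension exponential decay: queries at position n are additionally scaled by zeta_i^n and keys at position m by zeta_i^(-m), so their dot product gains a zeta_i^(n - m) factor that damps distant, high-frequency interactions. A rough sketch; the exact parameterization of zeta, the gamma = 0.4 default, and the scale_base divisor follow my recollection of the reference implementation and should be treated as assumptions:

```python
import torch

def xpos_scale(head_dim: int, positions: torch.Tensor,
               gamma: float = 0.4, scale_base: float = 512.0) -> torch.Tensor:
    """Per-dimension decay factors zeta_i ** (n / scale_base) applied on top of RoPE.

    zeta_i is smallest for high-frequency dimensions (strongest damping) and close
    to 1 for low-frequency ones; queries are multiplied by this factor and keys by
    its reciprocal, so attention scores pick up zeta ** ((n - m) / scale_base).
    """
    frac = torch.arange(0, head_dim, 2).float() / head_dim   # 2i/d in [0, 1)
    zeta = (frac + gamma) / (1.0 + gamma)                    # assumed parameterization
    return zeta[None, :] ** (positions.float()[:, None] / scale_base)  # [seq, head_dim // 2]
```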
SHAPE: Shifted Absolute Position Embedding for Transformers
tag: SHAPE
paper link: here
citation:
@article{kiyono2021shape,
title={SHAPE: Shifted absolute position embedding for transformers},
author={Kiyono, Shun and Kobayashi, Sosuke and Suzuki, Jun and Inui, Kentaro},
journal={arXiv preprint arXiv:2109.05644},
year={2021}
}
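SHAPE is ordinary absolute (sinusoidal) position embedding with a random global shift: during training every position index in a sequence is offset by the same random k, so the model cannot rely on absolute values and effectively learns relative offsets. A minimal sketch; max_shift is an assumed hyperparameter:

```python
import random

import torch

def shape_positions(seq_len: int, max_shift: int, training: bool = True) -> torch.Tensor:
    """SHAPE: shift every absolute position of a sequence by one random offset k
    during training; at inference time use the ordinary positions (k = 0)."""
    k = random.randint(0, max_shift) if training else 0
    return torch.arange(seq_len) + k
```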
PermuteFormer: Efficient Relative Position Encoding for Long Sequences
tag: Permuteformer
paper link: here
citation:
@article{chen2021permuteformer,
title={Permuteformer: Efficient relative position encoding for long sequences},
author={Chen, Peng},
journal={arXiv preprint arXiv:2109.02377},
year={2021}
}
RoFormer: Enhanced Transformer with Rotary Position Embedding
tag: RoPE
| Rotary PE
| RoFormer
overview:
paper link: here
blog link: here
citation:
@misc{su2023roformerenhancedtransformerrotary,
title={RoFormer: Enhanced Transformer with Rotary Position Embedding},
author={Jianlin Su and Yu Lu and Shengfeng Pan and Ahmed Murtadha and Bo Wen and Yunfeng Liu},
year={2023},
eprint={2104.09864},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2104.09864},
}
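RoPE encodes position by rotating every adjacent pair of query/key features by an angle m * theta_i that grows with the absolute position m, so the dot product between a rotated query and a rotated key depends only on their relative offset. A compact sketch of the paired form used in the paper:

```python
import torch

def apply_rope(x: torch.Tensor, base: float = 10000.0) -> torch.Tensor:
    """Apply rotary position embedding to x of shape [seq_len, head_dim]."""
    seq_len, head_dim = x.shape
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    angles = torch.arange(seq_len).float()[:, None] * inv_freq[None, :]   # [seq, d/2]
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[:, 0::2], x[:, 1::2]        # adjacent feature pairs (x_{2i}, x_{2i+1})
    out = torch.empty_like(x)
    out[:, 0::2] = x1 * cos - x2 * sin     # 2-D rotation of each pair by its angle
    out[:, 1::2] = x1 * sin + x2 * cos
    return out

q, k = torch.randn(16, 64), torch.randn(16, 64)
scores = apply_rope(q) @ apply_rope(k).T   # depends only on relative positions
```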
Attention Is All You Need
tag: SinPE
| Sinusoidal PE
| NIPS17
| Google
overview:
paper link: here
citation:
@article{vaswani2017attention,
title={Attention is all you need},
author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
journal={Advances in Neural Information Processing Systems},
volume={30},
year={2017}
}
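The original sinusoidal encoding assigns each position a fixed vector of sines and cosines at geometrically spaced frequencies: PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)), added to the token embeddings. A minimal sketch:

```python
import torch

def sinusoidal_pe(seq_len: int, d_model: int, base: float = 10000.0) -> torch.Tensor:
    """Fixed sinusoidal position encodings from 'Attention Is All You Need'."""
    pos = torch.arange(seq_len).float()[:, None]                              # [seq, 1]
    inv_freq = 1.0 / (base ** (torch.arange(0, d_model, 2).float() / d_model))
    pe = torch.zeros(seq_len, d_model)
    pe[:, 0::2] = torch.sin(pos * inv_freq)
    pe[:, 1::2] = torch.cos(pos * inv_freq)
    return pe

pe = sinusoidal_pe(seq_len=128, d_model=512)   # added to the token embeddings
```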