Here are some resources about Internal MemoryCache
Recalling the temporal nature of natural language, rather than the full parallelism behind the Transformer's success, we introduce the concept of Internal MemoryCache based on recurrence mechanisms. It divides long text into a stream of fixed-length segments and enhances the query of the current segment with contextual information cached from previous segments (a minimal sketch of this mechanism follows the topic list below).
For notational simplicity, we assume that every segment has the same length.
- Intro
- Segment-Level Recurrence
- Retrospective Recurrence
- Continuous-Signal Memory
- Alternate Cache Designs
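The caching idea above is essentially the segment-level recurrence popularized by Transformer-XL: the hidden states of earlier segments are kept in a gradient-detached cache, and the current segment's queries attend over both the cache and the current segment. The snippet below is a minimal PyTorch sketch under stated assumptions; the class name `MemoryCachedAttention`, the `mem_len` parameter, and the tensor shapes are illustrative choices rather than code from any of the cited papers, and relative positional encoding and causal masking are omitted for brevity.

```python
from typing import Optional

import torch
import torch.nn as nn


class MemoryCachedAttention(nn.Module):
    """Self-attention whose keys/values are extended with hidden states
    cached from previous segments (the internal memory cache)."""

    def __init__(self, d_model: int, n_heads: int, mem_len: int):
        super().__init__()
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.mem_len = mem_len
        self.q_proj = nn.Linear(d_model, d_model)
        self.kv_proj = nn.Linear(d_model, 2 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, seg: torch.Tensor, memory: Optional[torch.Tensor]):
        # seg: (batch, seg_len, d_model); memory: (batch, <=mem_len, d_model) or None.
        # Queries come only from the current segment; keys/values also cover
        # the cached, gradient-detached states from earlier segments.
        context = seg if memory is None else torch.cat([memory, seg], dim=1)

        b, q_len, _ = seg.shape
        k_len = context.shape[1]
        q = self.q_proj(seg).view(b, q_len, self.n_heads, self.d_head).transpose(1, 2)
        k, v = self.kv_proj(context).chunk(2, dim=-1)
        k = k.view(b, k_len, self.n_heads, self.d_head).transpose(1, 2)
        v = v.view(b, k_len, self.n_heads, self.d_head).transpose(1, 2)

        # Scaled dot-product attention (causal mask omitted for brevity).
        attn = torch.softmax(q @ k.transpose(-2, -1) / self.d_head ** 0.5, dim=-1)
        out = self.out_proj((attn @ v).transpose(1, 2).reshape(b, q_len, -1))

        # Roll the cache forward: keep the last `mem_len` states, detached so
        # gradients never flow across segment boundaries.
        new_memory = context[:, -self.mem_len:, :].detach()
        return out, new_memory


if __name__ == "__main__":
    # Process a long sequence as a stream of equal-length segments,
    # carrying the memory cache from one segment to the next.
    layer = MemoryCachedAttention(d_model=64, n_heads=4, mem_len=16)
    long_input = torch.randn(2, 64, 64)          # (batch, total_len, d_model)
    memory = None
    for segment in long_input.split(16, dim=1):  # fixed-length segments
        output, memory = layer(segment, memory)
```

The papers listed below vary this basic design: how far back the cache reaches, whether old states are compressed or retrieved, and whether memory is written once or updated continuously.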
paper link: here
citation:
@inproceedings{bai2021segatron,
title={Segatron: Segment-aware transformer for language modeling and understanding},
author={Bai, He and Shi, Peng and Lin, Jimmy and Xie, Yuqing and Tan, Luchen and Xiong, Kun and Gao, Wen and Li, Ming},
booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
volume={35},
number={14},
pages={12526--12534},
year={2021}
}
paper link: here
citation:
@article{rae2019compressive,
title={Compressive transformers for long-range sequence modelling},
author={Rae, Jack W and Potapenko, Anna and Jayakumar, Siddhant M and Lillicrap, Timothy P},
journal={arXiv preprint arXiv:1911.05507},
year={2019}
}
paper link: here
citation:
@article{dai2019transformer,
title={Transformer-XL: Attentive language models beyond a fixed-length context},
author={Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V and Salakhutdinov, Ruslan},
journal={arXiv preprint arXiv:1901.02860},
year={2019}
}
paper link: here
citation:
@article{zemlyanskiy2021readtwice,
title={ReadTwice: Reading very large documents with memories},
author={Zemlyanskiy, Yury and Ainslie, Joshua and de Jong, Michiel and Pham, Philip and Eckstein, Ilya and Sha, Fei},
journal={arXiv preprint arXiv:2105.04241},
year={2021}
}
paper link: here
citation:
@article{fan2020addressing,
title={Addressing some limitations of transformers with feedback memory},
author={Fan, Angela and Lavril, Thibaut and Grave, Edouard and Joulin, Armand and Sukhbaatar, Sainbayar},
journal={arXiv preprint arXiv:2002.09402},
year={2020}
}
paper link: here
citation:
@article{ding2020ernie,
title={ERNIE-Doc: A retrospective long-document modeling transformer},
author={Ding, Siyu and Shang, Junyuan and Wang, Shuohuan and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
journal={arXiv preprint arXiv:2012.15688},
year={2020}
}
paper link: here
citation:
@article{martins2021infty,
title={$\infty$-former: Infinite Memory Transformer},
author={Martins, Pedro Henrique and Marinho, Zita and Martins, Andr{\'e} FT},
journal={arXiv preprint arXiv:2109.00301},
year={2021}
}
paper link: here
citation:
@article{bulatov2023scaling,
title={Scaling Transformer to 1M tokens and beyond with RMT},
author={Bulatov, Aydar and Kuratov, Yuri and Burtsev, Mikhail S},
journal={arXiv preprint arXiv:2304.11062},
year={2023}
}
paper link: here
citation:
@article{wu2022memorizing,
title={Memorizing transformers},
author={Wu, Yuhuai and Rabe, Markus N and Hutchins, DeLesley and Szegedy, Christian},
journal={arXiv preprint arXiv:2203.08913},
year={2022}
}
paper link: here
citation:
@article{wu2020memformer,
title={Memformer: A memory-augmented transformer for sequence modeling},
author={Wu, Qingyang and Lan, Zhenzhong and Qian, Kun and Gu, Jing and Geramifard, Alborz and Yu, Zhou},
journal={arXiv preprint arXiv:2010.06891},
year={2020}
}