Here are some resources about activation functions in Transformers:
tag: Primer | Squared ReLU | NeurIPS21 | Google
paper link: here
github link: here
citation:
@misc{so2022primer,
  title={Primer: Searching for Efficient Transformers for Language Modeling},
  author={David R. So and Wojciech Mańke and Hanxiao Liu and Zihang Dai and Noam Shazeer and Quoc V. Le},
  year={2022},
  eprint={2109.08668},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2109.08668}
}
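The modification Primer found most robust is squaring the ReLU output inside the Transformer feed-forward block. A minimal NumPy sketch of that activation alone (not the full searched architecture):

```python
import numpy as np

def squared_relu(x: np.ndarray) -> np.ndarray:
    """Squared ReLU from Primer: relu(x) squared, i.e. max(0, x) ** 2."""
    return np.maximum(0.0, x) ** 2
```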
tag: GLU | SwiGLU | ReGLU | GEGLU | Google
paper link: here
github link: here
citation:
@misc{shazeer2020gluvariantsimprovetransformer,
  title={GLU Variants Improve Transformer},
  author={Noam Shazeer},
  year={2020},
  eprint={2002.05202},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2002.05202}
}
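These GLU variants gate one linear projection of the feed-forward input with an activation of a second projection. A minimal NumPy sketch of ReGLU, GEGLU, and SwiGLU, assuming bias-free projections as in the paper's FFN formulation (the helper names are mine):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gelu(z):
    # tanh approximation of GELU
    return 0.5 * z * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (z + 0.044715 * z**3)))

def glu_variant(x, W, V, act):
    """Gated unit: act(x @ W) multiplied elementwise with x @ V."""
    return act(x @ W) * (x @ V)

def reglu(x, W, V):
    return glu_variant(x, W, V, lambda z: np.maximum(0.0, z))

def geglu(x, W, V):
    return glu_variant(x, W, V, gelu)

def swiglu(x, W, V):
    return glu_variant(x, W, V, lambda z: z * sigmoid(z))  # Swish-1 (SiLU) gate
```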
tag: ReLU
paper link: here
github link: here
citation:
@misc{agarap2019deep,
  title={Deep Learning using Rectified Linear Units (ReLU)},
  author={Abien Fred Agarap},
  year={2019},
  eprint={1803.08375},
  archivePrefix={arXiv},
  primaryClass={cs.NE},
  url={https://arxiv.org/abs/1803.08375}
}
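For reference, the rectifier itself as a one-line NumPy sketch:

```python
import numpy as np

def relu(x: np.ndarray) -> np.ndarray:
    """ReLU: identity for positive inputs, zero elsewhere."""
    return np.maximum(0.0, x)
```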
tag: Swish | SiLU | Google Brain
paper link: here
citation:
@misc{ramachandran2017searchingactivationfunctions,
  title={Searching for Activation Functions},
  author={Prajit Ramachandran and Barret Zoph and Quoc V. Le},
  year={2017},
  eprint={1710.05941},
  archivePrefix={arXiv},
  primaryClass={cs.NE},
  url={https://arxiv.org/abs/1710.05941}
}
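Swish is x * sigmoid(beta * x); with beta fixed to 1 it is the SiLU. A minimal NumPy sketch:

```python
import numpy as np

def swish(x: np.ndarray, beta: float = 1.0) -> np.ndarray:
    """Swish: x * sigmoid(beta * x). beta = 1 gives SiLU; the paper also
    considers a learnable beta."""
    return x / (1.0 + np.exp(-beta * x))
```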
tag: GELU | UCB
paper link: here
github link: here
citation:
@misc{hendrycks2023gaussianerrorlinearunits,
  title={Gaussian Error Linear Units (GELUs)},
  author={Dan Hendrycks and Kevin Gimpel},
  year={2023},
  eprint={1606.08415},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/1606.08415}
}
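GELU weights the input by the standard normal CDF, GELU(x) = x * Phi(x). A minimal sketch of the exact form and the tanh approximation from the paper (SciPy is used only for erf):

```python
import numpy as np
from scipy.special import erf

def gelu_exact(x: np.ndarray) -> np.ndarray:
    """Exact GELU: x * Phi(x), with Phi the standard normal CDF."""
    return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))

def gelu_tanh(x: np.ndarray) -> np.ndarray:
    """Tanh approximation of GELU given in the paper."""
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))
```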
tag: SReLU | AAAI16
paper link: here
citation:
@misc{jin2015deeplearningsshapedrectified,
  title={Deep Learning with S-shaped Rectified Linear Activation Units},
  author={Xiaojie Jin and Chunyan Xu and Jiashi Feng and Yunchao Wei and Junjun Xiong and Shuicheng Yan},
  year={2015},
  eprint={1512.07030},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/1512.07030}
}
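SReLU is piecewise linear with two thresholds and two outer slopes that are learned per channel; between the thresholds it is the identity. A minimal NumPy sketch with scalar parameters (the defaults below are illustrative, not the paper's initialization):

```python
import numpy as np

def srelu(x, t_r=0.4, a_r=1.0, t_l=0.0, a_l=0.0):
    """S-shaped ReLU: identity for t_l < x < t_r, linear with slope a_r above
    t_r and slope a_l below t_l (all four parameters are learnable in the paper)."""
    upper = t_r + a_r * (x - t_r)
    lower = t_l + a_l * (x - t_l)
    return np.where(x >= t_r, upper, np.where(x <= t_l, lower, x))
```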
tag: ELU | ICLR16
paper link: here
citation:
@misc{clevert2016fastaccuratedeepnetwork,
  title={Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)},
  author={Djork-Arné Clevert and Thomas Unterthiner and Sepp Hochreiter},
  year={2016},
  eprint={1511.07289},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/1511.07289}
}
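ELU is the identity for positive inputs and saturates smoothly toward -alpha for negative ones. A minimal NumPy sketch:

```python
import numpy as np

def elu(x: np.ndarray, alpha: float = 1.0) -> np.ndarray:
    """ELU: x for x > 0, alpha * (exp(x) - 1) otherwise.
    expm1 on the clipped input avoids overflow for large positive x."""
    return np.where(x > 0, x, alpha * np.expm1(np.minimum(x, 0.0)))
```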
tag: ReLU | Leaky ReLU | PReLU | RReLU | ICML15 Workshop
paper link: here
citation:
@misc{xu2015empiricalevaluationrectifiedactivations,
  title={Empirical Evaluation of Rectified Activations in Convolutional Network},
  author={Bing Xu and Naiyan Wang and Tianqi Chen and Mu Li},
  year={2015},
  eprint={1505.00853},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/1505.00853}
}
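The leaky variants compared in this paper differ only in how the negative-side slope is chosen: fixed (Leaky ReLU), learned (PReLU), or randomly sampled during training (RReLU). A minimal NumPy sketch; the RReLU bounds 1/8 and 1/3 follow common implementations (e.g. PyTorch) rather than the paper's reciprocal parameterization:

```python
import numpy as np

def leaky_relu(x, negative_slope=0.01):
    """Leaky ReLU: fixed small slope on the negative side."""
    return np.where(x >= 0, x, negative_slope * x)

def prelu(x, a):
    """PReLU: same form, but slope `a` is a learned parameter (often per channel)."""
    return np.where(x >= 0, x, a * x)

def rrelu(x, lower=1/8, upper=1/3, training=True, rng=None):
    """RReLU: slope sampled uniformly per element at training time,
    fixed to the midpoint (lower + upper) / 2 at test time."""
    rng = np.random.default_rng() if rng is None else rng
    if training:
        slope = rng.uniform(lower, upper, size=np.shape(x))
    else:
        slope = (lower + upper) / 2.0
    return np.where(x >= 0, x, slope * x)
```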