
Activation Function

Here are some resources about activation functions in Transformers.

Primer: Searching for Efficient Transformers for Language Modeling

tag: Primer | Squared ReLU | NeurIPS21 | Google

paper link: [here](https://arxiv.org/abs/2109.08668)

github link: here

citation:

@misc{so2022primer,
      title={Primer: Searching for Efficient Transformers for Language Modeling}, 
      author={David R. So and Wojciech Mańke and Hanxiao Liu and Zihang Dai and Noam Shazeer and Quoc V. Le},
      year={2022},
      eprint={2109.08668},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}
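
The activation Primer's search settles on for the feed-forward block is a squared ReLU, i.e. the ReLU output squared element-wise. A minimal PyTorch sketch of that activation (the function name and tensor framing are illustrative, not taken from the paper's released code):

```python
import torch
import torch.nn.functional as F

def squared_relu(x: torch.Tensor) -> torch.Tensor:
    """Squared ReLU as used in Primer's feed-forward block: max(0, x) ** 2."""
    return F.relu(x).square()
```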

GLU Variants Improve Transformer

tag: GLU | SwiGLU | ReGLU | GEGLU | Google

paper link: [here](https://arxiv.org/abs/2002.05202)

github link: here

citation:

@misc{shazeer2020gluvariantsimprovetransformer,
      title={GLU Variants Improve Transformer}, 
      author={Noam Shazeer},
      year={2020},
      eprint={2002.05202},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2002.05202}, 
}
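
The paper gates the Transformer feed-forward layer with a GLU variant: SwiGLU uses Swish as the gate nonlinearity, ReGLU uses ReLU, and GEGLU uses GELU. A minimal PyTorch sketch of the SwiGLU feed-forward layer (the class and attribute names are my own, not from any released implementation):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLUFFN(nn.Module):
    """Feed-forward block with a SwiGLU gate:
    FFN(x) = (Swish(x W) * (x V)) W2, with biases omitted as in the paper."""

    def __init__(self, d_model: int, d_ff: int):
        super().__init__()
        self.w = nn.Linear(d_model, d_ff, bias=False)    # gate projection
        self.v = nn.Linear(d_model, d_ff, bias=False)    # value projection
        self.w2 = nn.Linear(d_ff, d_model, bias=False)   # output projection

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w2(F.silu(self.w(x)) * self.v(x))
```

Swapping `F.silu` for `F.relu` or `F.gelu` gives ReGLU or GEGLU; because the gated layer has three weight matrices instead of two, the paper shrinks d_ff to keep the parameter count comparable to the standard FFN.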

Deep Learning using Rectified Linear Units (ReLU)

tag: ReLU

paper link: [here](https://arxiv.org/abs/1803.08375)

github link: here

citation:

@misc{agarap2019deep,
      title={Deep Learning using Rectified Linear Units (ReLU)}, 
      author={Abien Fred Agarap},
      year={2019},
      eprint={1803.08375},
      archivePrefix={arXiv},
      primaryClass={cs.NE}
}
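
For reference, ReLU itself is just an element-wise max(0, x). A one-line PyTorch equivalent (illustrative only; PyTorch already ships this as torch.nn.functional.relu):

```python
import torch

def relu(x: torch.Tensor) -> torch.Tensor:
    """ReLU: element-wise max(0, x)."""
    return torch.clamp(x, min=0.0)
```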

Searching for Activation Functions

tag: Swish | SiLU | Google Brain

paper link: [here](https://arxiv.org/abs/1710.05941)

citation:

@misc{ramachandran2017searchingactivationfunctions,
      title={Searching for Activation Functions}, 
      author={Prajit Ramachandran and Barret Zoph and Quoc V. Le},
      year={2017},
      eprint={1710.05941},
      archivePrefix={arXiv},
      primaryClass={cs.NE},
      url={https://arxiv.org/abs/1710.05941}, 
}
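
Swish is defined as x * sigmoid(beta * x); with beta = 1 it coincides with SiLU. A minimal PyTorch sketch (the function name is mine; PyTorch exposes the beta = 1 case as torch.nn.functional.silu):

```python
import torch

def swish(x: torch.Tensor, beta: float = 1.0) -> torch.Tensor:
    """Swish: x * sigmoid(beta * x); beta = 1 gives SiLU."""
    return x * torch.sigmoid(beta * x)
```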

Gaussian Error Linear Units (GELUs)

tag: GELU | UCB

paper link: [here](https://arxiv.org/abs/1606.08415)

github link: here

citation:

@misc{hendrycks2023gaussianerrorlinearunits,
      title={Gaussian Error Linear Units (GELUs)}, 
      author={Dan Hendrycks and Kevin Gimpel},
      year={2023},
      eprint={1606.08415},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/1606.08415}, 
}
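
GELU weights its input by the standard Gaussian CDF, GELU(x) = x * Phi(x), and the paper also gives a tanh-based approximation. A minimal PyTorch sketch of both forms (function names are illustrative; torch.nn.functional.gelu implements the same activation):

```python
import math
import torch

def gelu_exact(x: torch.Tensor) -> torch.Tensor:
    """Exact GELU: x * Phi(x) = 0.5 * x * (1 + erf(x / sqrt(2)))."""
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh(x: torch.Tensor) -> torch.Tensor:
    """Tanh approximation of GELU given in the paper."""
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))
```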

Deep Learning with S-shaped Rectified Linear Activation Units

tag: SReLU | AAAI16

paper link: [here](https://arxiv.org/abs/1512.07030)

citation:

@misc{jin2015deeplearningsshapedrectified,
      title={Deep Learning with S-shaped Rectified Linear Activation Units}, 
      author={Xiaojie Jin and Chunyan Xu and Jiashi Feng and Yunchao Wei and Junjun Xiong and Shuicheng Yan},
      year={2015},
      eprint={1512.07030},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/1512.07030}, 
}
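
SReLU is piecewise linear with two learnable thresholds and two learnable slopes per channel: identity between the thresholds, linear with its own slope on either side. A rough PyTorch sketch, assuming the last tensor dimension is the channel dimension (the class name and the simple initialization are mine, not the paper's):

```python
import torch
import torch.nn as nn

class SReLU(nn.Module):
    """S-shaped ReLU: identity between thresholds t_l and t_r,
    linear with learnable slopes a_l / a_r outside them."""

    def __init__(self, num_channels: int):
        super().__init__()
        # Illustrative initialization; the paper describes its own scheme.
        self.t_l = nn.Parameter(torch.zeros(num_channels))
        self.a_l = nn.Parameter(torch.zeros(num_channels))
        self.t_r = nn.Parameter(torch.ones(num_channels))
        self.a_r = nn.Parameter(torch.ones(num_channels))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        below = torch.where(x < self.t_l, self.t_l + self.a_l * (x - self.t_l), x)
        return torch.where(x > self.t_r, self.t_r + self.a_r * (x - self.t_r), below)
```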

Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)

tag: ELU | ICLR16

paper link: [here](https://arxiv.org/abs/1511.07289)

citation:

@misc{clevert2016fastaccuratedeepnetwork,
      title={Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)}, 
      author={Djork-Arné Clevert and Thomas Unterthiner and Sepp Hochreiter},
      year={2016},
      eprint={1511.07289},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/1511.07289}, 
}
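
ELU is the identity for positive inputs and a saturating exponential, alpha * (exp(x) - 1), for negative ones, which pushes mean activations toward zero. A minimal PyTorch sketch (equivalent to torch.nn.functional.elu):

```python
import torch

def elu(x: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    """ELU: x for x > 0, alpha * (exp(x) - 1) otherwise."""
    return torch.where(x > 0, x, alpha * (torch.exp(x) - 1.0))
```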

Empirical Evaluation of Rectified Activations in Convolutional Network

tag: ReLU | Leaky ReLU | PReLU | RReLU | ICML15

paper link: [here](https://arxiv.org/abs/1505.00853)

citation:

@misc{xu2015empiricalevaluationrectifiedactivations,
      title={Empirical Evaluation of Rectified Activations in Convolutional Network}, 
      author={Bing Xu and Naiyan Wang and Tianqi Chen and Mu Li},
      year={2015},
      eprint={1505.00853},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/1505.00853}, 
}
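
The rectified variants compared in the paper share one form, f(x) = x for x >= 0 and a * x otherwise; they differ only in how the negative slope a is set: Leaky ReLU fixes it, PReLU learns it, and RReLU samples it uniformly during training and uses the mean of the sampling range at test time. A minimal PyTorch sketch of the shared form (the function name is mine; PyTorch provides leaky_relu, prelu, and rrelu in torch.nn.functional):

```python
import torch

def leaky_rectifier(x: torch.Tensor, a: float = 0.01) -> torch.Tensor:
    """Generalized leaky rectifier: x for x >= 0, a * x otherwise."""
    return torch.where(x >= 0, x, a * x)
```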