biblio.bib

% Journal article. The month is given through the BibTeX month macro; the
% earlier number field held the month name, which is not an issue number.
@article{auer2002using,
  author  = {Auer, Peter},
  title   = {Using confidence bounds for exploitation-exploration trade-offs},
  journal = {Journal of Machine Learning Research},
  volume  = {3},
  month   = nov,
  pages   = {397--422},
  year    = {2002},
}

% Conference paper. The booktitle is written with its proper capitalisation
% because standard styles print booktitle exactly as stored.
@inproceedings{li2010contextual,
  author    = {Li, Lihong and Chu, Wei and Langford, John and Schapire, Robert E.},
  title     = {A contextual-bandit approach to personalized news article recommendation},
  booktitle = {Proceedings of the 19th International Conference on World Wide Web},
  pages     = {661--670},
  year      = {2010},
}

% Conference paper. PMLR is stored as the publisher; the organization field
% is meant for a sponsoring body.
@inproceedings{agrawal2013thompson,
  author    = {Agrawal, Shipra and Goyal, Navin},
  title     = {Thompson sampling for contextual bandits with linear payoffs},
  booktitle = {International Conference on Machine Learning},
  pages     = {127--135},
  year      = {2013},
  publisher = {PMLR},
}

% Conference paper. Booktitle capitalised as a proper name; Springer is
% stored as the publisher rather than as a sponsoring organization.
@inproceedings{kaufmann2012thompson,
  author    = {Kaufmann, Emilie and Korda, Nathaniel and Munos, R{\'e}mi},
  title     = {Thompson sampling: An asymptotically optimal finite-time analysis},
  booktitle = {International Conference on Algorithmic Learning Theory},
  pages     = {199--213},
  year      = {2012},
  publisher = {Springer},
}

% Conference paper. Q-learning and the acronyms are brace-protected so that
% styles which recase text keep their capitals; IEEE is the publisher.
@inproceedings{tijsma2016comparing,
  author    = {Tijsma, Arryon D. and Drugan, Madalina M. and Wiering, Marco A.},
  title     = {Comparing exploration strategies for {Q-learning} in random stochastic mazes},
  booktitle = {2016 {IEEE} Symposium Series on Computational Intelligence ({SSCI})},
  pages     = {1--8},
  year      = {2016},
  publisher = {IEEE},
}

% Book. The 2018 printing is the second edition, so the edition is recorded
% explicitly; MIT is brace-protected in the publisher name.
@book{sutton2018reinforcement,
  author    = {Sutton, Richard S. and Barto, Andrew G.},
  title     = {Reinforcement learning: An introduction},
  edition   = {Second},
  publisher = {{MIT} Press},
  year      = {2018},
}

% arXiv preprint. The identifier lives in the eprint fields instead of being
% embedded in a journal name; Atari is brace-protected as a proper noun.
@misc{mnih2013playing,
  author        = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},
  title         = {Playing {Atari} with deep reinforcement learning},
  year          = {2013},
  eprint        = {1312.5602},
  archiveprefix = {arXiv},
}

% Journal article. The author list is intentionally truncated with the
% literal "and others" token, which the style renders as et al.
@article{mnih2015human,
  author    = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and others},
  title     = {Human-level control through deep reinforcement learning},
  journal   = {Nature},
  volume    = {518},
  number    = {7540},
  pages     = {529--533},
  year      = {2015},
  publisher = {Nature Publishing Group},
  doi       = {10.1038/nature14236},
}

% arXiv preprint. The identifier lives in the eprint fields instead of being
% embedded in a journal name.
@misc{schaul2015prioritized,
  author        = {Schaul, Tom and Quan, John and Antonoglou, Ioannis and Silver, David},
  title         = {Prioritized experience replay},
  year          = {2015},
  eprint        = {1511.05952},
  archiveprefix = {arXiv},
}

% Conference paper. Only the volume is kept: classic BibTeX styles reject an
% inproceedings entry that sets both volume and number (the number was 1).
% The name particle is lowercase so BibTeX parses it as the von part.
@inproceedings{van2016deep,
  author    = {van Hasselt, Hado and Guez, Arthur and Silver, David},
  title     = {Deep reinforcement learning with double {Q-learning}},
  booktitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
  volume    = {30},
  year      = {2016},
}

% Paper in conference proceedings, so it is typed as inproceedings with a
% booktitle. The author name carries its particle, and the index-site name
% previously stored as publisher has been dropped.
@inproceedings{hasselt2010double,
  author    = {van Hasselt, Hado},
  title     = {Double {Q-learning}},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {23},
  pages     = {2613--2621},
  year      = {2010},
}

% Conference paper. Only the volume is kept: classic BibTeX styles reject an
% inproceedings entry that sets both volume and number (the number was 1).
% The name particle is lowercase so BibTeX parses it as the von part.
@inproceedings{hessel2018rainbow,
  author    = {Hessel, Matteo and Modayil, Joseph and van Hasselt, Hado and Schaul, Tom and Ostrovski, Georg and Dabney, Will and Horgan, Dan and Piot, Bilal and Azar, Mohammad and Silver, David},
  title     = {Rainbow: Combining improvements in deep reinforcement learning},
  booktitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
  volume    = {32},
  year      = {2018},
}

% arXiv preprint. Q-learning and MDPs are brace-protected so recasing styles
% keep their capitals; the identifier lives in the eprint fields.
@misc{hausknecht2015deep,
  author        = {Hausknecht, Matthew and Stone, Peter},
  title         = {Deep recurrent {Q-learning} for partially observable {MDPs}},
  year          = {2015},
  eprint        = {1507.06527},
  archiveprefix = {arXiv},
}

% arXiv preprint. The identifier lives in the eprint fields instead of being
% embedded in a journal name.
@misc{schulman2015high,
  author        = {Schulman, John and Moritz, Philipp and Levine, Sergey and Jordan, Michael and Abbeel, Pieter},
  title         = {High-dimensional continuous control using generalized advantage estimation},
  year          = {2015},
  eprint        = {1506.02438},
  archiveprefix = {arXiv},
}

% Conference paper. The booktitle is written with its proper capitalisation
% because standard styles print booktitle exactly as stored.
@inproceedings{sutton2000policy,
  author    = {Sutton, Richard S. and McAllester, David A. and Singh, Satinder P. and Mansour, Yishay},
  title     = {Policy gradient methods for reinforcement learning with function approximation},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1057--1063},
  year      = {2000},
}

% Conference paper. Booktitle capitalised as a proper name; PMLR is stored
% as the publisher rather than as a sponsoring organization.
@inproceedings{mnih2016asynchronous,
  author    = {Mnih, Volodymyr and Badia, Adria Puigdomenech and Mirza, Mehdi and Graves, Alex and Lillicrap, Timothy and Harley, Tim and Silver, David and Kavukcuoglu, Koray},
  title     = {Asynchronous methods for deep reinforcement learning},
  booktitle = {International Conference on Machine Learning},
  pages     = {1928--1937},
  year      = {2016},
  publisher = {PMLR},
}

% Conference paper. Booktitle capitalised as a proper name; PMLR is stored
% as the publisher rather than as a sponsoring organization.
@inproceedings{schulman2015trust,
  author    = {Schulman, John and Levine, Sergey and Abbeel, Pieter and Jordan, Michael and Moritz, Philipp},
  title     = {Trust region policy optimization},
  booktitle = {International Conference on Machine Learning},
  pages     = {1889--1897},
  year      = {2015},
  publisher = {PMLR},
}

% Paper in conference proceedings, so it is typed as inproceedings with a
% booktitle, matching the other entries from the same proceedings series.
@inproceedings{bhatnagar2007incremental,
  author    = {Bhatnagar, Shalabh and Ghavamzadeh, Mohammad and Lee, Mark and Sutton, Richard S.},
  title     = {Incremental natural actor-critic algorithms},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {20},
  pages     = {105--112},
  year      = {2007},
}

% Journal article in Machine Learning, volume 108, issue 8.
@article{pajarinen2019compatible,
  author    = {Pajarinen, Joni and Thai, Hong Linh and Akrour, Riad and Peters, Jan and Neumann, Gerhard},
  title     = {Compatible natural gradient policy search},
  journal   = {Machine Learning},
  volume    = {108},
  number    = {8},
  pages     = {1443--1466},
  year      = {2019},
  publisher = {Springer}
}

% arXiv preprint. The identifier lives in the eprint fields instead of being
% embedded in a journal name.
@misc{schulman2017proximal,
  author        = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
  title         = {Proximal policy optimization algorithms},
  year          = {2017},
  eprint        = {1707.06347},
  archiveprefix = {arXiv},
}

% arXiv preprint. The identifier lives in the eprint fields instead of being
% embedded in a journal name.
@misc{degris2012off,
  author        = {Degris, Thomas and White, Martha and Sutton, Richard S.},
  title         = {Off-policy actor-critic},
  year          = {2012},
  eprint        = {1205.4839},
  archiveprefix = {arXiv},
}

% Conference paper (ICML 2000). The earlier record came from an institutional
% repository listing: it named the repository series as the journal, used the
% series item number as the page, and listed only the first author.
% NOTE(review): confirm pages and publisher against the proceedings.
@inproceedings{precup2000eligibility,
  author    = {Precup, Doina and Sutton, Richard S. and Singh, Satinder},
  title     = {Eligibility traces for off-policy policy evaluation},
  booktitle = {Proceedings of the Seventeenth International Conference on Machine Learning},
  pages     = {759--766},
  year      = {2000},
  publisher = {Morgan Kaufmann},
}

% arXiv preprint. The identifier lives in the eprint fields instead of being
% embedded in a journal name.
@misc{munos2016safe,
  author        = {Munos, R{\'e}mi and Stepleton, Tom and Harutyunyan, Anna and Bellemare, Marc G.},
  title         = {Safe and efficient off-policy reinforcement learning},
  year          = {2016},
  eprint        = {1606.02647},
  archiveprefix = {arXiv},
}

% arXiv preprint. The given name of Munos carries its accent, spelled the
% same way as in the other entries of this file; the identifier lives in the
% eprint fields.
@misc{wang2016sample,
  author        = {Wang, Ziyu and Bapst, Victor and Heess, Nicolas and Mnih, Volodymyr and Munos, R{\'e}mi and Kavukcuoglu, Koray and de Freitas, Nando},
  title         = {Sample efficient actor-critic with experience replay},
  year          = {2016},
  eprint        = {1611.01224},
  archiveprefix = {arXiv},
}

% arXiv preprint. The identifier lives in the eprint fields instead of being
% embedded in a journal name.
@misc{lillicrap2015continuous,
  author        = {Lillicrap, Timothy P. and Hunt, Jonathan J. and Pritzel, Alexander and Heess, Nicolas and Erez, Tom and Tassa, Yuval and Silver, David and Wierstra, Daan},
  title         = {Continuous control with deep reinforcement learning},
  year          = {2015},
  eprint        = {1509.02971},
  archiveprefix = {arXiv},
}

% Conference paper. Booktitle capitalised as a proper name; PMLR is stored
% as the publisher rather than as a sponsoring organization.
@inproceedings{silver2014deterministic,
  author    = {Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin},
  title     = {Deterministic policy gradient algorithms},
  booktitle = {International Conference on Machine Learning},
  pages     = {387--395},
  year      = {2014},
  publisher = {PMLR},
}

% Conference paper. The second author's surname carries its particle, in
% lowercase so BibTeX parses it as the von part; PMLR is the publisher.
@inproceedings{fujimoto2018addressing,
  author    = {Fujimoto, Scott and van Hoof, Herke and Meger, David},
  title     = {Addressing function approximation error in actor-critic methods},
  booktitle = {International Conference on Machine Learning},
  pages     = {1587--1596},
  year      = {2018},
  publisher = {PMLR},
}

% arXiv preprint. The surname TB is written in capitals (it had been
% case-folded); the identifier lives in the eprint fields.
@misc{barth2018distributed,
  author        = {Barth-Maron, Gabriel and Hoffman, Matthew W. and Budden, David and Dabney, Will and Horgan, Dan and TB, Dhruva and Muldal, Alistair and Heess, Nicolas and Lillicrap, Timothy},
  title         = {Distributed distributional deterministic policy gradients},
  year          = {2018},
  eprint        = {1804.08617},
  archiveprefix = {arXiv},
}

% arXiv preprint. The method name Q-Prop is brace-protected so recasing
% styles keep its capitals; the identifier lives in the eprint fields.
@misc{gu2016q,
  author        = {Gu, Shixiang and Lillicrap, Timothy and Ghahramani, Zoubin and Turner, Richard E. and Levine, Sergey},
  title         = {{Q-Prop}: Sample-efficient policy gradient with an off-policy critic},
  year          = {2016},
  eprint        = {1611.02247},
  archiveprefix = {arXiv},
}

% Online lecture material. The address stays in howpublished so that classic
% styles without a url field still print it; the hand-formatted label in
% front of the address was removed so the style controls presentation.
@misc{silver2015,
  author       = {Silver, David},
  title        = {Lectures on Reinforcement Learning},
  howpublished = {\url{https://www.davidsilver.uk/teaching/}},
  year         = {2015},
}