{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T18:53:51Z","timestamp":1764960831680,"version":"3.46.0"},"reference-count":54,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/100017149","name":"DeepMind","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100017149","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100013373","name":"Alberta Machine Intelligence Institute","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100013373","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000038","name":"Natural Sciences and Engineering Research Council of Canada","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007631","name":"Canadian Institute for Advanced Research","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100007631","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1109\/tnnls.2024.3373749","type":"journal-article","created":{"date-parts":[[2024,6,10]],"date-time":"2024-06-10T13:26:23Z","timestamp":1718025983000},"page":"4477-4491","source":"Crossref","is-referenced-by-count":1,"title":["Off-Policy Prediction Learning: An Empirical Study of Online Algorithms"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5853-5049","authenticated-orcid":false,"given":"Sina","family":"Ghiassian","sequence":"first","affiliation":[{"name":"Department of Computing Science, University of Alberta, Edmonton, AB, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4641-7349","authenticated-orcid":false,"given":"Banafsheh","family":"Rafiee","sequence":"additional","affiliation":[{"name":"Reinforcement Learning and Artificial Intelligence (RLAI) Laboratory, University of Alberta, Edmonton, AB, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3679-3415","authenticated-orcid":false,"given":"Richard S.","family":"Sutton","sequence":"additional","affiliation":[{"name":"Department of Computing Science, University of Alberta, Edmonton, AB, Canada"}]}],"member":"263","reference":[{"article-title":"Learning from delayed rewards","year":"1989","author":"Watkins","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992698"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1038\/nature14236"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10295"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11796"},{"key":"ref6","first-page":"1407","article-title":"IMPALA: Scalable distributed deeP-RL with importance weighted actor-learner architectures","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Espeholt"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i6.20660"},{"key":"ref8","first-page":"761","article-title":"Horde: A scalable real-time architecture for learning knowledge from unsupervised sensorimotor interaction","volume-title":"Proc. 10th Int. Conf. Auto. 
Agents Multiagent Syst.","volume":"2","author":"Sutton"},{"article-title":"Developing a predictive approach to knowledge","year":"2015","author":"White","key":"ref9"},{"key":"ref10","article-title":"Representing knowledge as predictions (and state as Knowledge)","author":"Ring","year":"2021","journal-title":"arXiv:2112.06336"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/S0004-3702(99)00052-1"},{"key":"ref12","first-page":"1","article-title":"Predictive representations of state","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"14","author":"Littman"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/1102351.1102463"},{"key":"ref14","article-title":"Reinforcement learning with unsupervised auxiliary tasks","author":"Jaderberg","year":"2016","journal-title":"arXiv:1611.05397"},{"article-title":"Safe reinforcement learning","year":"2015","author":"Thomas","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3172130"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3015767"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2897814"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-377-6.50013-X"},{"key":"ref20","first-page":"417","article-title":"Off-policy temporal-difference learning with function approximation","volume-title":"Proc. ICML","author":"Precup"},{"article-title":"Gradient temporal-difference learning algorithms","year":"2011","author":"Maei","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553501"},{"article-title":"Reinforcement learning through gradient descent","year":"1999","author":"Baird","key":"ref23"},{"issue":"73","key":"ref24","first-page":"1","article-title":"An emphatic approach to the problem of off-policy temporal-difference learning","volume":"17","author":"Sutton","year":"2016","journal-title":"J. Mach. Learn. Res."},{"key":"ref25","article-title":"Proximal reinforcement learning: A new theory of sequential decision making in primal-dual spaces","author":"Mahadevan","year":"2014","journal-title":"arXiv:1405.6757"},{"key":"ref26","first-page":"3524","article-title":"Gradient temporal-difference learning with regularized corrections","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ghiassian"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10227"},{"key":"ref28","first-page":"1","article-title":"Safe and efficient off-policy reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"29","author":"Munos"},{"key":"ref29","article-title":"Multi-step off-policy learning without importance sampling ratios","author":"Mahmood","year":"2017","journal-title":"arXiv:1702.03006"},{"key":"ref30","article-title":"MinAtar: An Atari-inspired testbed for thorough and reproducible reinforcement learning experiments","author":"Young","year":"2019","journal-title":"arXiv:1903.03176"},{"key":"ref31","first-page":"289","article-title":"Off-policy learning with eligibility traces: A survey","volume":"15","author":"Geist","year":"2014","journal-title":"J. Mach. Learn. Res."},{"issue":"1","key":"ref32","first-page":"809","article-title":"Policy evaluation with temporal differences: A survey and comparison","volume":"15","author":"Dann","year":"2014","journal-title":"J. Mach. Learn. Res."},{"key":"ref33","first-page":"494","article-title":"Investigating practical linear temporal difference learning","volume-title":"Proc. Int. Conf. Auto. 
Agents Multiagent Syst.","author":"White"},{"key":"ref34","first-page":"1","article-title":"Two-timescale networks for nonlinear value function approximation","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Chung"},{"key":"ref35","first-page":"759","article-title":"Eligibility traces for off-policy policy evaluation","volume-title":"Proc. 17th Int. Conf. Mach. Learn.","author":"Precup"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/BF00115009"},{"key":"ref37","first-page":"4195","article-title":"Proximal gradient temporal difference learning algorithms","volume-title":"Proc. IJCAI","author":"Liu"},{"article-title":"Faster gradient-TD algorithms","year":"2012","author":"Hackman","key":"ref38"},{"issue":"48","key":"ref39","first-page":"1","article-title":"On generalized Bellman equations and temporal-difference learning","volume":"19","author":"Yu","year":"2018","journal-title":"J. Mach. Learn. Res."},{"key":"ref40","first-page":"332","article-title":"Prediction in intelligence: An empirical comparison of off-policy algorithms on robots","volume-title":"Proc. 18th Int. Conf. Auto. Agents MultiAgent Syst.","author":"Rafiee"},{"key":"ref41","first-page":"1","article-title":"Prediction driven behavior: Learning predictions that drive fixed responses","volume-title":"Proc. Workshops 28th AAAI Conf. Artif. Intell.","author":"Modayil"},{"key":"ref42","article-title":"A first empirical study of emphatic temporal difference learning","author":"Ghiassian","year":"2017","journal-title":"arXiv:1705.04185"},{"issue":"1","key":"ref43","first-page":"872","article-title":"Adaptive step-size for online temporal difference learning","volume-title":"Proc. AAAI Conf. Artif. Intell.","volume":"26","author":"Dabney"},{"key":"ref44","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014","journal-title":"arXiv:1412.6980"},{"key":"ref45","article-title":"On the convergence of Adam and beyond","author":"Reddi","year":"2019","journal-title":"arXiv:1904.09237"},{"issue":"7","key":"ref46","first-page":"2121","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi","year":"2011","journal-title":"J. Mach. Learn. Res."},{"volume-title":"Reinforcement Learning: An Introduction","year":"2018","author":"Sutton","key":"ref47"},{"issue":"21","key":"ref48","first-page":"1609","article-title":"A convergent O (n) algorithm for off-policy temporal-difference learning with linear function approximation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"21","author":"Sutton"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1287\/10-SSY011"},{"key":"ref50","first-page":"1","article-title":"A kernel loss for solving the Bellman equation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Feng"},{"key":"ref51","first-page":"1125","article-title":"SBEED: Convergent reinforcement learning with nonlinear function approximation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Dai"},{"key":"ref52","article-title":"Should all temporal difference learning use emphasis?","author":"Gu","year":"2019","journal-title":"arXiv:1903.00194"},{"key":"ref53","first-page":"3742","article-title":"Unifying task specification in reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"White"},{"key":"ref54","first-page":"4955","article-title":"Convergent tree backup and retrace with function approximation","volume-title":"Proc. Int. Conf. 
Mach. Learn.","author":"Touati"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5962385\/10908444\/10552310.pdf?arnumber=10552310","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T18:39:25Z","timestamp":1764959965000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10552310\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3]]},"references-count":54,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2024.3373749","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"type":"print","value":"2162-237X"},{"type":"electronic","value":"2162-2388"}],"subject":[],"published":{"date-parts":[[2025,3]]}}}