{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T07:07:57Z","timestamp":1774940877232,"version":"3.50.1"},"publisher-location":"Cham","reference-count":23,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319463780","type":"print"},{"value":"9783319463797","type":"electronic"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46379-7_21","type":"book-chapter","created":{"date-parts":[[2016,9,20]],"date-time":"2016-09-20T10:54:33Z","timestamp":1474368873000},"page":"305-320","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Q($$\\lambda $$) with Off-Policy Corrections"],"prefix":"10.1007","author":[{"given":"Anna","family":"Harutyunyan","sequence":"first","affiliation":[]},{"given":"Marc G.","family":"Bellemare","sequence":"additional","affiliation":[]},{"given":"Tom","family":"Stepleton","sequence":"additional","affiliation":[]},{"given":"R\u00e9mi","family":"Munos","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,21]]},"reference":[{"key":"21_CR1","volume-title":"Dynamic Programming","author":"R Bellman","year":"1957","unstructured":"Bellman, R.: Dynamic Programming. Princeton University Press, Princeton (1957)"},{"key":"21_CR2","volume-title":"Neuro-Dynamic Programming","author":"DP Bertsekas","year":"1996","unstructured":"Bertsekas, D.P., Tsitsiklis, J.N.: Neuro-Dynamic Programming. Athena Scientific, Belmont (1996)"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Hallak, A., Tamar, A., Munos, R., Mannor, S.: Generalized emphatic temporal difference learning: bias-variance analysis (2015). arXiv:1509.05172","DOI":"10.1609\/aaai.v30i1.10227"},{"key":"21_CR4","unstructured":"Kearns, M.J., Singh, S.P.: Bias-variance error bounds for temporal difference updates. In: Conference on Computational Learning Theory, pp. 142\u2013147 (2000)"},{"key":"21_CR5","unstructured":"Mahmood, A.R., Sutton, R.S.: Off-policy learning based on weighted importance sampling with linear computational complexity. In: Conference on Uncertainty in Artificial Intelligence (2015)"},{"key":"21_CR6","unstructured":"Mahmood, A.R., Huizhen, Y., White, M., Sutton, R.S.: Emphatic temporal-difference learning. arXiv preprint arXiv:1507.01569 (2015)"},{"key":"21_CR7","unstructured":"Munos, R., Stepleton, T., Harutyunyan, A., Bellemare, M.G.: Safe and efficient off-policy reinforcement learning. In: Advances in Neural Information Processing Systems (2016)"},{"issue":"1\u20133","key":"21_CR8","first-page":"283","volume":"22","author":"J Peng","year":"1996","unstructured":"Peng, J., Williams, R.J.: Incremental multi-step q-learning. Mach. Learn. 22(1\u20133), 283\u2013290 (1996)","journal-title":"Mach. Learn."},{"key":"21_CR9","unstructured":"Precup, D., Sutton, R.S., Singh, S.: Eligibility traces for off-policy policy evaluation. In: International Conference on Machine Learning (2000)"},{"key":"21_CR10","unstructured":"Precup, D., Sutton, R.S., Dasgupta, S.: Off-policy temporal-difference learning with function approximation. In: International Conference on Machine Learning (2001)"},{"key":"21_CR11","doi-asserted-by":"publisher","DOI":"10.1002\/9780470316887","volume-title":"Markov Decision Processes: Discrete Stochastic Dynamic Programming","author":"ML Puterman","year":"1994","unstructured":"Puterman, M.L.: Markov Decision Processes: Discrete Stochastic Dynamic Programming, 1st edn. Wiley, New York (1994)","edition":"1"},{"key":"21_CR12","unstructured":"Randl\u00f8v, J., Alstr\u00f8m, P.: Learning to drive a bicycle using reinforcement learning and shaping. In: International Conference on Machine Learning (1998)"},{"key":"21_CR13","unstructured":"Rummery, G.A., Niranjan, M.: On-line q-learning using connectionist systems. Technical report, Cambridge University Engineering Department (1994)"},{"issue":"1","key":"21_CR14","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1023\/A:1007495401240","volume":"32","author":"S Singh","year":"1998","unstructured":"Singh, S., Dayan, P.: Analytical mean squared error curves for temporal difference learning. Mach. Learn. 32(1), 5\u201340 (1998)","journal-title":"Mach. Learn."},{"issue":"1","key":"21_CR15","first-page":"9","volume":"3","author":"RS Sutton","year":"1988","unstructured":"Sutton, R.S.: Learning to predict by the methods of temporal differences. Mach. learn. 3(1), 9\u201344 (1988)","journal-title":"Mach. learn."},{"key":"21_CR16","unstructured":"Sutton, R.S.: Generalization in reinforcement learning: successful examples using sparse coarse coding. In: Advances in Neural Information Processing Systems (1996)"},{"key":"21_CR17","volume-title":"Reinforcement Learning: An Introduction","author":"RS Sutton","year":"1998","unstructured":"Sutton, R.S., Barto, A.G.: Reinforcement Learning: An Introduction. Cambridge University Press, Cambridge (1998)"},{"key":"21_CR18","unstructured":"Sutton, R.S., Mahmood, A.R., Precup, D., van Hasselt, H.: A new q ($$\\lambda $$) with interim forward view and monte carlo equivalence. In: International Conference on Machine Learning, pp. 568\u2013576 (2014)"},{"key":"21_CR19","unstructured":"van Hasselt, H.P.: Insights in reinforcement learning: formal analysis and empirical evaluation of temporal-difference learning algorithms. Ph.D. thesis, Universiteit Utrecht, January 2011"},{"key":"21_CR20","unstructured":"van Seijen, H., Sutton, R.S.: True online TD($$\\lambda $$). In: International Conference on Machine Learning, pp. 692\u2013700 (2014)"},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"van Seijen, H., van Hasselt, H., Whiteson, S., Wiering, M.: A theoretical and empirical analysis of expected Sarsa. In: Adaptive Dynamic Programming and Reinforcement Learning, pp. 177\u2013184. IEEE (2009)","DOI":"10.1109\/ADPRL.2009.4927542"},{"key":"21_CR22","first-page":"272","volume":"8","author":"CJCH Watkins","year":"1992","unstructured":"Watkins, C.J.C.H., Dayan, P.: Q-learning. Mach. Learn. 8, 272\u2013292 (1992)","journal-title":"Mach. Learn."},{"key":"21_CR23","unstructured":"Watkins, C.J.C.H.: Learning from delayed rewards. Ph.D. thesis, King\u2019s College, Cambridge (1989)"}],"container-title":["Lecture Notes in Computer Science","Algorithmic Learning Theory"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46379-7_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,11]],"date-time":"2024-03-11T14:07:06Z","timestamp":1710166026000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-46379-7_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319463780","9783319463797"],"references-count":23,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46379-7_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"21 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ALT","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Algorithmic Learning Theory","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bari","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 October 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 October 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"alt2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}