{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:02:29Z","timestamp":1755838949539,"version":"3.40.4"},"publisher-location":"Cham","reference-count":47,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030461324"},{"type":"electronic","value":"9783030461331"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-46133-1_2","type":"book-chapter","created":{"date-parts":[[2020,4,30]],"date-time":"2020-04-30T07:08:58Z","timestamp":1588230538000},"page":"19-34","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Sample-Efficient Model-Free Reinforcement Learning with Off-Policy Critics"],"prefix":"10.1007","author":[{"given":"Denis","family":"Steckelmacher","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"H\u00e9l\u00e8ne","family":"Plisnier","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Diederik M.","family":"Roijers","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ann","family":"Now\u00e9","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,4,30]]},"reference":[{"key":"2_CR1","unstructured":"Agrawal, S., Goyal, N.: Analysis of Thompson sampling for the multi-armed bandit problem. In: Conference on Learning Theory (COLT) (2012)"},{"key":"2_CR2","unstructured":"Anthony, T., Tian, Z., Barber, D.: Thinking fast and slow with deep learning and tree search. In: Advances in Neural Information Processing Systems (NIPS), pp. 5366\u20135376 (2017)"},{"key":"2_CR3","unstructured":"Arjona-Medina, J.A., Gillhofer, M., Widrich, M., Unterthiner, T., Hochreiter, S.: RUDDER: return decomposition for delayed rewards. arXiv abs\/1806.07857 (2018)"},{"issue":"5","key":"2_CR4","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TSMC.1983.6313077","volume":"13","author":"AG Barto","year":"1983","unstructured":"Barto, A.G., Sutton, R.S., Anderson, C.W.: Neuronlike adaptive elements that can solve difficult learning control problems. IEEE Trans. Syst. Man Cybern. 13(5), 834\u2013846 (1983)","journal-title":"IEEE Trans. Syst. Man Cybern."},{"key":"2_CR5","unstructured":"Bellemare, M.G., Dabney, W., Munos, R.: A distributional perspective on reinforcement learning. In: International Conference on Machine Learning (ICML), pp. 449\u2013458 (2017)"},{"key":"2_CR6","first-page":"679","volume":"6","author":"R Bellman","year":"1957","unstructured":"Bellman, R.: A Markovian decision process. J. Math. Mech. 6, 679\u2013684 (1957)","journal-title":"J. Math. Mech."},{"key":"2_CR7","unstructured":"B\u00f6hmer, W., Guo, R., Obermayer, K.: Non-deterministic policy improvement stabilizes approximated reinforcement learning. arXiv abs\/1612.07548 (2016)"},{"key":"2_CR8","unstructured":"Brockman, G., et al.: OpenAI Gym (2016)"},{"key":"2_CR9","unstructured":"Burda, Y., Edwards, H., Storkey, A., Klimov, O.: Exploration by random network distillation. arXiv abs\/1810.12894 (2018)"},{"key":"2_CR10","unstructured":"Chapelle, O., Li, L.: An empirical evaluation of Thompson sampling. In: Advances in Neural Information Processing Systems (NIPS), pp. 2249\u20132257 (2011)"},{"key":"2_CR11","unstructured":"Chen, R.Y., Sidor, S., Abbeel, P., Schulman, J.: UCB exploration via Q-ensembles. arXiv abs\/1706.01502 (2017)"},{"key":"2_CR12","unstructured":"Degris, T., White, M., Sutton, R.S.: Linear off-policy actor-critic. In: International Conference on Machine Learning (ICML) (2012)"},{"key":"2_CR13","unstructured":"Fu, J., Kumar, A., Soh, M., Levine, S.: Diagnosing bottlenecks in deep Q-learning algorithms. arXiv abs\/1902.10250 (2019)"},{"key":"2_CR14","unstructured":"Fujimoto, S., Hoof, H.V., Meger, D.: Addressing function approximation error in actor-critic methods. In: International Conference on Machine Learning (ICML), pp. 1582\u20131591 (2018)"},{"key":"2_CR15","unstructured":"Gruslys, A., Azar, M.G., Bellemare, M.G., Munos, R.: The reactor: a sample-efficient actor-critic architecture. arXiv abs\/1704.04651 (2017)"},{"key":"2_CR16","unstructured":"Gu, S., Lillicrap, T., Turner, R.E., Ghahramani, Z., Sch\u00f6lkopf, B., Levine, S.: Interpolated policy gradient: merging on-policy and off-policy gradient estimation for deep reinforcement learning. In: Advances in Neural Information Processing Systems (NIPS), pp. 3849\u20133858 (2017)"},{"key":"2_CR17","unstructured":"Gu, S., Lillicrap, T., Ghahramani, Z., Turner, R.E., Levine, S.: Q-prop: sample-efficient policy gradient with an off-policy critic. In: International Conference on Learning Representations (ICLR) (2017)"},{"key":"2_CR18","unstructured":"Haarnoja, T., Zhou, A., Abbeel, P., Levine, S.: Soft actor-critic: off-policy maximum entropy deep reinforcement learning with a stochastic actor. arXiv abs\/1801.01290 (2018)"},{"key":"2_CR19","unstructured":"van Hasselt, H.: Double Q-learning. In: Neural Information Processing Systems (NIPS), p. 9 (2010)"},{"key":"2_CR20","unstructured":"Hessel, M., et al.: Rainbow: combining improvements in deep reinforcement learning. arXiv abs\/1710.02298 (2017)"},{"key":"2_CR21","unstructured":"Kakade, S., Langford, J.: Approximately optimal approximate reinforcement learning. In: International Conference on Machine Learning (ICML), pp. 267\u2013274 (2002)"},{"key":"2_CR22","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"issue":"1","key":"2_CR23","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1137\/S036301299731669X","volume":"38","author":"VR Konda","year":"1999","unstructured":"Konda, V.R., Borkar, V.S.: Actor-critic-type learning algorithms for Markov decision processes. SIAM J. Control Opt. 38(1), 94\u2013123 (1999)","journal-title":"SIAM J. Control Opt."},{"key":"2_CR24","unstructured":"Lillicrap, T.P., et al.: Continuous control with deep reinforcement learning. arXiv abs\/1509.02971 (2015)"},{"issue":"3\u20134","key":"2_CR25","first-page":"293","volume":"8","author":"LJ Lin","year":"1992","unstructured":"Lin, L.J.: Self-improving reactive agents based on reinforcement learning, planning and teaching. Mach. Learn. 8(3\u20134), 293\u2013321 (1992)","journal-title":"Mach. Learn."},{"key":"2_CR26","unstructured":"Mnih, V., et al.: Asynchronous methods for deep reinforcement learning. In: International Conference on Machine Learning (ICML), p. 10 (2016)"},{"key":"2_CR27","unstructured":"Nikolov, N., Kirschner, J., Berkenkamp, F., Andreas, K.: Information-directed exploration for deep reinforcement learning. In: International Conference on Learning Representations (ICLR) (2019, in preparation)"},{"key":"2_CR28","unstructured":"O\u2019Donoghue, B., Munos, R., Kavukcuoglu, K., Mnih, V.: PGQ: combining policy gradient and Q-learning. In: International Conference on Learning Representations (ICLR), p. 15 (2017)"},{"key":"2_CR29","unstructured":"Osband, I., Aslanides, J., Cassirer, A.: Randomized prior functions for deep reinforcement learning. arXiv abs\/1806.03335 (2018)"},{"key":"2_CR30","unstructured":"Osband, I., Blundell, C., Pritzel, A., Van Roy, B.: Deep exploration via bootstrapped DQN. In: Advances in Neural Information Processing Systems (NIPS) (2016)"},{"key":"2_CR31","unstructured":"Parisotto, E., Ba, J., Salakhutdinov, R.: Actor-mimic: deep multitask and transfer reinforcement learning. In: International Conference on Learning Representations (ICLR) (2016)"},{"key":"2_CR32","doi-asserted-by":"crossref","unstructured":"Pazis, J., Lagoudakis, M.G.: Binary action search for learning continuous-action control policies. In: International Conference on Machine Learning (ICML), pp. 793\u2013800. ACM (2009)","DOI":"10.1145\/1553374.1553476"},{"key":"2_CR33","unstructured":"Pirotta, M., Restelli, M., Pecorino, A., Calandriello, D.: Safe policy iteration. In: Proceedings of the 30th International Conference on Machine Learning (ICML), pp. 307\u2013315 (2013)"},{"key":"2_CR34","unstructured":"Rusu, A.A., et al.: Policy distillation. arXiv abs\/1511.06295 (2015)"},{"key":"2_CR35","unstructured":"Scherrer, B.: Approximate policy iteration schemes: a comparison. In: Proceedings of the 31th International Conference on Machine Learning (ICML), pp. 1314\u20131322 (2014)"},{"key":"2_CR36","unstructured":"Schulman, J., Levine, S., Abbeel, P., Jordan, M.I., Moritz, P.: Trust region policy optimization. In: International Conference on Machine Learning (ICML) (2015)"},{"key":"2_CR37","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. arXiv abs\/1707.06347 (2017)"},{"issue":"7676","key":"2_CR38","doi-asserted-by":"publisher","first-page":"354","DOI":"10.1038\/nature24270","volume":"550","author":"D Silver","year":"2017","unstructured":"Silver, D., et al.: Mastering the game of go without human knowledge. Nature 550(7676), 354 (2017)","journal-title":"Nature"},{"key":"2_CR39","unstructured":"Sun, W., Gordon, G.J., Boots, B., Bagnell, J.A.: Dual policy iteration. arXiv abs\/1805.10755 (2018)"},{"key":"2_CR40","unstructured":"Sutton, R., McAllester, D., Singh, S., Mansour, Y.: Policy gradient methods for reinforcement learning with function approximation. In: Neural Information Processing Systems (NIPS), p. 7 (2000)"},{"key":"2_CR41","unstructured":"Thomas, P.S., Theocharous, G., Ghavamzadeh, M.: High confidence policy improvement. In: International Conference on Machine Learning (ICML), pp. 2380\u20132388 (2015)"},{"issue":"3\/4","key":"2_CR42","doi-asserted-by":"publisher","first-page":"285","DOI":"10.2307\/2332286","volume":"25","author":"WR Thompson","year":"1933","unstructured":"Thompson, W.R.: On the likelihood that one unknown probability exceeds another in view of the evidence of two samples. Biometrika 25(3\/4), 285\u2013294 (1933)","journal-title":"Biometrika"},{"key":"2_CR43","unstructured":"Wagner, P.: A reinterpretation of the policy oscillation phenomenon in approximate policy iteration. In: Advances in Neural Information Processing Systems (NIPS), pp. 2573\u20132581 (2011)"},{"key":"2_CR44","unstructured":"Wang, Z., et al.: Sample efficient actor-critic with experience replay. Technical report (2016)"},{"issue":"3\u20134","key":"2_CR45","first-page":"279","volume":"8","author":"C Watkins","year":"1992","unstructured":"Watkins, C., Dayan, P.: Q-learning. Mach. Learn. 8(3\u20134), 279\u2013292 (1992)","journal-title":"Mach. Learn."},{"issue":"3","key":"2_CR46","first-page":"229","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams, R.J.: Simple statistical gradient-following algorithms for connectionist reinforcement learning. Mach. Learn. 8(3), 229\u2013256 (1992)","journal-title":"Mach. Learn."},{"key":"2_CR47","unstructured":"Wu, Y., Mansimov, E., Grosse, R.B., Liao, S., Ba, J.: Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation. In: Advances in Neural Information Processing Systems (NIPS), pp. 5279\u20135288 (2017)"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-46133-1_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,29]],"date-time":"2025-04-29T22:03:58Z","timestamp":1745964238000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-46133-1_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030461324","9783030461331"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-46133-1_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"30 April 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"W\u00fcrzburg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2019","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2019","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 September 2019","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2019","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/ecmlpkdd2019.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Microsoft CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"733","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"130","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"18% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.04","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5.3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"ECML PKDD Workshops Information: single-blind review, submissions: 200, full papers accepted: 70, short papers accepted: 46","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}