{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T14:23:26Z","timestamp":1774880606618,"version":"3.50.1"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2019,2,9]],"date-time":"2019-02-09T00:00:00Z","timestamp":1549670400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61836003"],"award-info":[{"award-number":["61836003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61573150"],"award-info":[{"award-number":["61573150"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2019,8]]},"DOI":"10.1007\/s10489-019-01417-4","type":"journal-article","created":{"date-parts":[[2019,2,9]],"date-time":"2019-02-09T14:49:34Z","timestamp":1549723774000},"page":"2874-2888","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":16,"title":["A novel multi-step reinforcement learning method for solving reward hacking"],"prefix":"10.1007","volume":"49","author":[{"given":"Yinlong","family":"Yuan","sequence":"first","affiliation":[]},{"given":"Zhu Liang","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Zhenghui","family":"Gu","sequence":"additional","affiliation":[]},{"given":"Xiaoyan","family":"Deng","sequence":"additional","affiliation":[]},{"given":"Yuanqing","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,2,9]]},"reference":[{"key":"1417_CR1","unstructured":"Amin K, Jiang N, Singh S (2017) Repeated inverse reinforcement learning. In: Advances in Neural Information Processing Systems (NIPS), pp 1815\u20131824"},{"key":"1417_CR2","unstructured":"Amodei D, Olah C, Steinhardt J, Christiano P, Schulman J, Man\u00e9 D (2016) Concrete problems in ai safety. arXiv: 160606565"},{"key":"1417_CR3","doi-asserted-by":"publisher","first-page":"30","DOI":"10.1016\/j.patrec.2018.04.012","volume":"111","author":"Y An","year":"2018","unstructured":"An Y, Ding S, Shi S, Li J (2018) Discrete space reinforcement learning algorithm based on support vector machine classification. Pattern Recogn Lett 111:30\u201335","journal-title":"Pattern Recogn Lett"},{"issue":"6","key":"1417_CR4","doi-asserted-by":"publisher","first-page":"26","DOI":"10.1109\/MSP.2017.2743240","volume":"34","author":"K Arulkumaran","year":"2017","unstructured":"Arulkumaran K, Deisenroth MP, Brundage M, Bharath AA (2017) Deep reinforcement learning: a brief survey. IEEE Signal Proc Mag 34(6):26\u201338","journal-title":"IEEE Signal Proc Mag"},{"key":"1417_CR5","unstructured":"Aslund H, Mhamdi EME, Guerraoui R, Maurer A (2018) Virtuously safe reinforcement learning. arXiv: 180511447"},{"key":"1417_CR6","doi-asserted-by":"crossref","unstructured":"Bragg J, Habli I (2018) What is acceptably safe for reinforcement learning. 
In: International workshop on artificial intelligence safety engineering","DOI":"10.1007\/978-3-319-99229-7_35"},{"key":"1417_CR7","doi-asserted-by":"crossref","unstructured":"De Asis K, Hernandez-Garcia JF, Holland GZ, Sutton RS (2017) Multi-step reinforcement learning: A unifying algorithm. arXiv: 170301327v1","DOI":"10.1609\/aaai.v32i1.11631"},{"issue":"1","key":"1417_CR8","doi-asserted-by":"publisher","first-page":"219","DOI":"10.1162\/089976600300015961","volume":"12","author":"K Doya","year":"2000","unstructured":"Doya K (2000) Reinforcement learning in continuous time and space. Neural Comput 12(1):219\u2013245","journal-title":"Neural Comput"},{"key":"1417_CR9","doi-asserted-by":"crossref","unstructured":"Everitt T, Krakovna V, Orseau L, Hutter M, Legg S (2017) Reinforcement learning with a corrupted reward channel. In: International joint conferences on artificial intelligence (IJCAI), pp 4705\u20134713","DOI":"10.24963\/ijcai.2017\/656"},{"key":"1417_CR10","doi-asserted-by":"crossref","unstructured":"Fernandez-Gauna B, Osa JL, Gra\u00f1a M (2017) Experiments of conditioned reinforcement learning in continuous space control tasks. Neurocomputing 271:38\u201347","DOI":"10.1016\/j.neucom.2016.08.155"},{"key":"1417_CR11","unstructured":"Garcia J, Femandez F (2015) A comprehensive survey on safe reinforcement learning. J Mach Learn Res 16:1437\u20131480"},{"key":"1417_CR12","unstructured":"Hadfield-Menell D, Milli S, Abbeel P, Russell SJ, Dragan A (2017) Inverse reward design. In: Advances in neural information processing systems (NIPS), pp 6765\u20136774"},{"key":"1417_CR13","unstructured":"Hessel M, Modayil J, Van Hasselt H, Schaul T, Ostrovski G, Dabney W, Horgan D, Piot B, Azar M, Silver D (2017) Rainbow: Combining improvements in deep reinforcement learning. arXiv: 171002298"},{"key":"1417_CR14","unstructured":"Horgan D, Quan J, Budden D, Barth-Maron G, Hessel M, Van Hasselt H, Silver D (2018) Distributed prioritized experience replay. arXiv: 180300933"},{"issue":"6","key":"1417_CR15","doi-asserted-by":"publisher","first-page":"1185","DOI":"10.1162\/neco.1994.6.6.1185","volume":"6","author":"T Jaakkola","year":"1993","unstructured":"Jaakkola T, Jordan MI, Singh SP (1993) Convergence of stochastic iterative dynamic programming algorithms. Neural Comput 6(6):1185\u20131201","journal-title":"Neural Comput"},{"key":"1417_CR16","unstructured":"Laurent O, Stuart A (2016) Safely interruptible agents. In: Association for uncertainty in artificial intelligence"},{"key":"1417_CR17","unstructured":"Leike J, Martic M, Krakovna V, Ortega P A, Everitt T, Lefrancq A, Orseau L, Legg S (2017) Ai safety gridworlds. arXiv: 171109883"},{"issue":"3","key":"1417_CR18","doi-asserted-by":"publisher","first-page":"305","DOI":"10.3758\/s13420-012-0082-6","volume":"40","author":"EA Ludvig","year":"2012","unstructured":"Ludvig EA, Sutton RS, Kehoe EJ (2012) Evaluating the td model of classical conditioning. Learning & Behavior 40(3):305\u2013 319","journal-title":"Learning & Behavior"},{"issue":"2","key":"1417_CR19","doi-asserted-by":"publisher","first-page":"900","DOI":"10.1109\/TIT.2008.2009797","volume":"55","author":"D Marco","year":"2009","unstructured":"Marco D (2009) Markov random processes are neither bandlimited nor recoverable from samples or after quantization. 
IEEE Trans Inf Theory 55(2):900\u2013905","journal-title":"IEEE Trans Inf Theory"},{"key":"1417_CR20","unstructured":"Mnih V, Kavukcuoglu K, Silver D, Graves A, Antonoglou I, Wierstra D, Riedmiller M (2013) Playing atari with deep reinforcement learning. In: Annual Conference on Neural Information Processing Systems (NIPS)"},{"issue":"7540","key":"1417_CR21","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1038\/nature14236","volume":"518","author":"V Mnih","year":"2015","unstructured":"Mnih V, Kavukcuoglu K, Silver D, Rusu AA, Veness J, Bellemare MG, Graves A, Riedmiller M, Fidjeland A, Ostrovski G et al (2015) Human-level control through deep reinforcement learning. Nature 518(7540):529\u2013533","journal-title":"Nature"},{"key":"1417_CR22","unstructured":"Mnih V, Badia AP, Mirza M, Graves A, Lillicrap T, Harley T, Silver D, Kavukcuoglu K (2016) Asynchronous methods for deep reinforcement learning. In: International conference on machine learning (ICML), pp 1928\u20131937"},{"issue":"2","key":"1417_CR23","doi-asserted-by":"publisher","first-page":"443","DOI":"10.1007\/s10994-017-5666-0","volume":"107","author":"TM Moerland","year":"2018","unstructured":"Moerland TM, Broekens J, Jonker CM (2018) Emotion in reinforcement learning agents and robots: a survey. Mach Learn 107(2):443\u2013480","journal-title":"Mach Learn"},{"issue":"3","key":"1417_CR24","first-page":"1073","volume":"6","author":"SA Murphy","year":"2005","unstructured":"Murphy SA (2005) A generalization error for q-learning. Journal of Machine Learning Research Jmlr 6(3):1073","journal-title":"Journal of Machine Learning Research Jmlr"},{"key":"1417_CR25","doi-asserted-by":"publisher","first-page":"487","DOI":"10.1007\/s10489-015-0665-y","volume":"43","author":"E Pakizeh","year":"2015","unstructured":"Pakizeh E, Pedram M M, Palhang M (2015) Multi-criteria expertness based cooperative method for sarsa and eligibility trace algorithms. Appl Intell 43:487\u2013498","journal-title":"Appl Intell"},{"key":"1417_CR26","first-page":"886","volume":"1","author":"S Pathak","year":"2017","unstructured":"Pathak S, Pulina L, Tacchella A (2017) Verification and repair of control policies for safe reinforcement learning. Appl Intell 1:886\u2013908","journal-title":"Appl Intell"},{"issue":"7-9","key":"1417_CR27","doi-asserted-by":"publisher","first-page":"1180","DOI":"10.1016\/j.neucom.2007.11.026","volume":"71","author":"J Peters","year":"2008","unstructured":"Peters J, Schaal S (2008) Natural actor-critic. Neurocomputing 71(7-9):1180\u20131190","journal-title":"Neurocomputing"},{"key":"1417_CR28","unstructured":"Richard GF, Shlomo Z (2016) Safety in ai-hri: challenges complementing user experience quality. In: AAAI Conference on Artificial Intelligence(AAAI"},{"issue":"1","key":"1417_CR29","first-page":"5057","volume":"17","author":"HV Seijen","year":"2016","unstructured":"Seijen HV, Mahmood AR, Pilarski PM, Machado MC, Sutton RS (2016) True online temporal-difference learning. J Mach Learn Res 17(1):5057\u20135096","journal-title":"J Mach Learn Res"},{"issue":"3","key":"1417_CR30","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1023\/A:1007678930559","volume":"38","author":"S Singh","year":"2000","unstructured":"Singh S, Jaakkola T, Littman ML, Szepesvari C (2000) Convergence results for single-step on-policy reinforcement-learning algorithms. 
Mach Learn 38(3):287\u2013308","journal-title":"Mach Learn"},{"issue":"4-6","key":"1417_CR31","doi-asserted-by":"publisher","first-page":"523","DOI":"10.1016\/S0893-6080(02)00046-1","volume":"15","author":"RE Suri","year":"2002","unstructured":"Suri RE (2002) Td models of reward predictive responses in dopamine neurons. Neural Netw 15(4-6):523\u2013533","journal-title":"Neural Netw"},{"key":"1417_CR32","unstructured":"Sutton R, Barto A (2017) Introduction to rinforcement learning (2nd Edition, in preparation). MIT Press"},{"key":"1417_CR33","unstructured":"Sutton RS (2016) Tile coding software \u2013 reference manual, version 3 beta. http:\/\/incompleteideas.net\/tiles\/tiles3.html"},{"key":"1417_CR34","doi-asserted-by":"crossref","unstructured":"Van Seijen H, Van Hasselt H, Whiteson S, Wiering M (2009) A theoretical and empirical analysis of expected sarsa. In: Proceedings of the IEEE symposium on adaptive dynamic programming reinforcement learning, pp 177\u2013184","DOI":"10.1109\/ADPRL.2009.4927542"},{"key":"1417_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.ins.2013.08.037","volume":"261","author":"X Xu","year":"2014","unstructured":"Xu X, Zuo L, Huang Z (2014) Reinforcement learning algorithms with function approximation: recent advances and applications. Inf Sci 261:1\u201331","journal-title":"Inf Sci"},{"key":"1417_CR36","unstructured":"Zhao X, Ding S, An Y (2018) A new asynchronous architecture for tabular reinforcement learning algorithms. In: Proceedings of the 8th international conference on extreme learning machines, pp 172\u2013180"},{"issue":"12","key":"1417_CR37","doi-asserted-by":"publisher","first-page":"4889","DOI":"10.1007\/s10489-018-1241-z","volume":"48","author":"X Zhao","year":"2018","unstructured":"Zhao X, Ding S, An Y, Jia W (2018) Asynchronous reinforcement learning algorithms for solving discrete space path planning problems. Appl Intell 48(12):4889\u20134904","journal-title":"Appl Intell"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10489-019-01417-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-019-01417-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-019-01417-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,11]],"date-time":"2022-09-11T17:24:18Z","timestamp":1662917058000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10489-019-01417-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,2,9]]},"references-count":37,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2019,8]]}},"alternative-id":["1417"],"URL":"https:\/\/doi.org\/10.1007\/s10489-019-01417-4","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,2,9]]},"assertion":[{"value":"9 February 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}