{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T11:31:47Z","timestamp":1775561507785,"version":"3.50.1"},"reference-count":64,"publisher":"Springer Science and Business Media LLC","issue":"S27","license":[{"start":{"date-parts":[[2021,5,19]],"date-time":"2021-05-19T00:00:00Z","timestamp":1621382400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,5,19]],"date-time":"2021-05-19T00:00:00Z","timestamp":1621382400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Synthese"],"published-print":{"date-parts":[[2021,11]]},"DOI":"10.1007\/s11229-021-03141-4","type":"journal-article","created":{"date-parts":[[2021,5,19]],"date-time":"2021-05-19T16:06:06Z","timestamp":1621440366000},"page":"6435-6467","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":30,"title":["Reward tampering problems and solutions in reinforcement learning: a causal influence diagram perspective"],"prefix":"10.1007","volume":"198","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1210-9866","authenticated-orcid":false,"given":"Tom","family":"Everitt","sequence":"first","affiliation":[]},{"given":"Marcus","family":"Hutter","sequence":"additional","affiliation":[]},{"given":"Ramana","family":"Kumar","sequence":"additional","affiliation":[]},{"given":"Victoria","family":"Krakovna","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,5,19]]},"reference":[{"key":"3141_CR1","unstructured":"Amodei, D., Olah, C., Steinhardt, J., Christiano, P., Schulman, J., & Mane, D. (2016). Concrete problems in AI Safety. arXiv: 1606.06565."},{"key":"3141_CR2","unstructured":"Armstrong, S., & O\u2019Rourke, X. (2017). 
\u2018Indifference\u2019 methods for managing agent rewards. arXiv: 1712.06365."},{"key":"3141_CR3","doi-asserted-by":"crossref","unstructured":"Armstrong, S., Orseau, L., Leike, J., & Legg, S. (2020). Pitfalls in learning a reward function online. In IJCAI. arXiv: 2004.13654.","DOI":"10.24963\/ijcai.2020\/221"},{"key":"3141_CR4","unstructured":"Balke, A., & Pearl, J. (1994). Probabilistic evaluation of counterfactual queries. In AAAI (pp. 230\u2013237)."},{"key":"3141_CR5","unstructured":"Bostrom, N. (2014). Superintelligence: Paths, dangers, strategies. Oxford University Press."},{"key":"3141_CR6","doi-asserted-by":"crossref","unstructured":"Carey, R. (2018). Incorrigibility in the CIRL framework. In AAAI\/ACM conference on artificial intelligence, ethics and society. Machine Intelligence Research Institute.","DOI":"10.1145\/3278721.3278750"},{"key":"3141_CR7","unstructured":"Christiano, P., Leike, J., Brown, T. B., Martic, M., Legg, S., Amodei, D. (2017). Deep reinforcement learning from human preferences. In Advances in neural information processing systems (pp. 4302\u20134310). arXiv: 1706.03741."},{"key":"3141_CR8","unstructured":"Christiano, P., Shlegeris, B., & Amodei, D. (2018). Supervising strong learners by amplifying weak experts. arXiv: 1810.08575."},{"key":"3141_CR9","unstructured":"Demski, A., & Garrabrant, S. (2019). Embedded agency. arXiv: 1902.09469."},{"key":"3141_CR10","unstructured":"Dennett, D. C. (2017). From bacteria to bach and back: The evolution of minds (p. 0393355500). W. W. Norton & Company."},{"key":"3141_CR11","doi-asserted-by":"crossref","unstructured":"Dewey, D. (2011). Learning what to Value. In Artificial general intelligence (Vol. 6830, pp. 309\u2013314). isbn: 978-3-642-22886-5. arXiv: 1402.5379. http:\/\/www.springerlink.com\/index\/10.1007\/978-3-642-22887-2.","DOI":"10.1007\/978-3-642-22887-2_35"},{"key":"3141_CR12","unstructured":"Everitt, T. (2018). Towards safe artificial general intelligence. PhD thesis. 
Australian National University. http:\/\/hdl.handle.net\/1885\/164227."},{"key":"3141_CR13","unstructured":"Everitt, T., Carey, R., Langlois, E., Ortega, P. A. & Legg, S. (2021). Agent incentives: A causal perspective. In AAAI. arXiv: 2102.01685."},{"key":"3141_CR14","doi-asserted-by":"crossref","unstructured":"Everitt, T., Filan, D., Daswani, M., & Hutter, M. (2016). Self-modification of policy and utility function in rational agents. In Artificial general intelligence (pp. 1\u201311). ISBN: 9783319416489. arXiv: 1605.03142.","DOI":"10.1007\/978-3-319-41649-6_1"},{"key":"3141_CR15","doi-asserted-by":"crossref","unstructured":"Everitt, T., Krakovna, V., Orseau, L., Hutter, M., & Legg, S. (2017). Reinforcement learning with corrupted reward signal. In IJCAI international joint conference on artificial intelligence (pp. 4705\u20134713). arXiv: 1705.08417.","DOI":"10.24963\/ijcai.2017\/656"},{"key":"3141_CR16","doi-asserted-by":"crossref","unstructured":"Everitt, T., Lea, G., & Hutter, M. (2018). AGI safety literature review. In International joint conference on artificial intelligence (IJCAI). arXiv: 1805.01109.","DOI":"10.24963\/ijcai.2018\/768"},{"key":"3141_CR17","unstructured":"Everitt, T., Ortega, P. A., Barnes, E., & Legg, S. (2019). Understanding agent incentives using causal influence diagrams. Part I: Single action settings. arXiv:1902.09980."},{"key":"3141_CR18","unstructured":"Freedman, R., Shah, R., & Dragan, A. (2020). Choice set misspecification in reward inference. In IJCAI AI safety workshop."},{"key":"3141_CR19","doi-asserted-by":"crossref","unstructured":"Gabriel, I. (2020). Artificial intelligence, values and alignment. In Minds and machines (Vol. 30, pp. 411\u2013437). arXiv: 2001.09768.","DOI":"10.1007\/s11023-020-09539-2"},{"key":"3141_CR20","unstructured":"Hadfield-Menell, D., Dragan, A., Abbeel, P., & Russell, S. J. (2016). Cooperative inverse reinforcement learning. In Advances in neural information processing systems (pp. 3909\u20133917). 
arXiv: 1606.03137."},{"key":"3141_CR21","unstructured":"Hadfield-Menell, D., Milli, S., Abbeel, P., Russell, S. J., & Dragan, A. (2017). Inverse reward design. In Advances in neural information processing systems (pp. 6768\u20136777). arXiv: 1711.02827."},{"issue":"1","key":"3141_CR22","doi-asserted-by":"publisher","first-page":"1","DOI":"10.2478\/v10229-011-0013-5","volume":"3","author":"Bill Hibbard","year":"2012","unstructured":"Hibbard, Bill. (2012). Model-based utility functions. Journal of Artificial General Intelligence, 3(1), 1\u201324. arXiv: 1111.3934.","journal-title":"Journal of Artificial General Intelligence"},{"key":"3141_CR23","unstructured":"Howard, R. A., & Matheson, J. E. (1984). Influence diagrams. In Readings on the principles and applications of decision analysis (pp. 721\u2013762)."},{"key":"3141_CR24","unstructured":"Hubinger, E., van Merwijk, C., Mikulik, V., Skalse, J., & Garrabrant, S. (2019). Risks from Learned Optimization. In Advanced machine learning systems. arXiv: 1906.01820."},{"key":"3141_CR25","unstructured":"Jeon, H. J., Milli, S., & Dragan, A. D. (2020). Reward-rational (implicit) choice: A unifying formalism for reward learning. arXiv: 2002.04833."},{"issue":"1\u20132","key":"3141_CR26","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1016\/S0004-3702(98)00023-X","volume":"101","author":"Leslie Pack Kaelbling","year":"1998","unstructured":"Kaelbling, Leslie Pack, Littman, Michael L., & Cassandra, Anthony R. (1998). Planning and acting in partially observable stochastic domains. Artificial Intelligence, 101(1\u20132), 99\u2013134.","journal-title":"Artificial Intelligence"},{"key":"3141_CR27","doi-asserted-by":"crossref","unstructured":"Knox, W. B., & Stone, P. (2009). Interactively shaping agents via human reinforcement. In Proceedings of the fifth international conference on Knowledge capture \u2014K-CAP \u201909 September (p. 
9).","DOI":"10.1145\/1597735.1597738"},{"issue":"1","key":"3141_CR28","doi-asserted-by":"publisher","first-page":"181","DOI":"10.1016\/S0899-8256(02)00544-4","volume":"45","author":"Daphne Koller","year":"2003","unstructured":"Koller, Daphne, & Milch, Brian. (2003). Multi-agent influence diagrams for representing and solving games. Games and Economic Behavior, 45(1), 181\u2013221.","journal-title":"Games and Economic Behavior"},{"key":"3141_CR29","unstructured":"Krakovna, V., Uesato, J., Mikulik, V., Rahtz, M., Everitt, T., Kumar, R., Kenton, Z., Leike, J., & Legg, S. (2020). Specification gaming: The flip side of AI ingenuity. https:\/\/deepmind.com\/blog\/article\/Specification-gaming-the-flip-side-of-AI-ingenuity (visited on 07\/16\/2020)."},{"key":"3141_CR30","unstructured":"Kumar, R., Uesato, J., Ngo, R., Everitt, T., Krakovna, V., & Legg, S. (2020). REALab: An embedded perspective on tampering.arXiv: 2011.08820."},{"key":"3141_CR31","unstructured":"Langlois, E., & Everitt, T. (2021). How RL agents behave when their actions are modified. In AAAI. arXiv: 2102.07716."},{"key":"3141_CR32","doi-asserted-by":"publisher","first-page":"140","DOI":"10.1016\/j.tcs.2013.09.022","volume":"519","author":"T Lattimore","year":"2014","unstructured":"Lattimore, T., & Hutter, M. (2014). General time consistent discounting. Theoretical Computer Science, 519, 140\u2013154. arXiv: 1107.5528.","journal-title":"Theoretical Computer Science"},{"issue":"9","key":"3141_CR33","doi-asserted-by":"publisher","first-page":"1235","DOI":"10.1287\/mnsc.47.9.1235.9779","volume":"47","author":"SL Lauritzen","year":"2001","unstructured":"Lauritzen, S. L., & Nilsson, D. (2001). Representing and solving decision problems with limited information. Management Science, 47(9), 1235\u20131251.","journal-title":"Management Science"},{"key":"3141_CR34","unstructured":"LaVictoire, P., Fallenstein, B., Yudkowsky, E. S., Barasz, M., Christiano, P., & Herreshoff, M. (2014). 
Program equilibrium in the prisoner\u2019s dilemma via L\u00f6b\u2019s Theorem. In AAAI workshop on multiagent interaction without prior coordination."},{"key":"3141_CR35","unstructured":"Lehman, J., Clune, J., Misevic, D., Adami, C., Altenberg, L., Beaulieu, J., Bentley, P. J., Bernard, S., Beslon, G., Bryson, D. M., Chrabaszcz, P., Cheney, N., Cully, A., Doncieux, S., Dyer, F. C., Ellefsen, K. O., Feldt, R., Fischer, S., Forrest, S., ..., Yosinski, J. (2018). The surprising creativity of digital evolution: A collection of anecdotes from the evolutionary computation and artificial life research communities. arXiv: 1803.03453."},{"key":"3141_CR36","unstructured":"Leike, J., Krueger, D., Everitt, T., Martic, M., Maini, V., & Legg, S. (2018). Scalable agent alignment via reward modeling: A research direction. arXiv: 1811.07871."},{"key":"3141_CR37","unstructured":"Leike, J., Martic, M., Krakovna, V., Ortega, P. A., Everitt, T., Lefrancq, A., Orseau, L., & Legg, S. (2017). AI safety gridworlds. arXiv: 1711.09883."},{"key":"3141_CR38","unstructured":"Levine, S., Kumar, A., Tucker, G., & Fu, J. (2020). Offline reinforcement learning: Tutorial, review, and perspectives on open problems. arXiv: 2005.01643."},{"key":"3141_CR39","unstructured":"Masterjun. (2014). SNES Super Mario World (USA) \u201carbitrary code execution\u201d.http:\/\/tasvideos.org\/2513M.html (visited on 01\/23\/2019)."},{"key":"3141_CR40","doi-asserted-by":"crossref","unstructured":"Milli, S., Belli, L., & Hardt, M. (2020). From optimizing engagement to measuring value. In FAccT. arXiv: 2008.12623.","DOI":"10.1145\/3442188.3445933"},{"key":"3141_CR41","doi-asserted-by":"crossref","unstructured":"Milli, S., Hadfield-Menell, D., Dragan, A., & Russell, S. J. (2017). Should robots be obedient? In IJCAI (pp. 4754\u20134760). ISBN: 9780999241103. arXiv: 1705.09990.","DOI":"10.24963\/ijcai.2017\/662"},{"key":"3141_CR42","unstructured":"Ng, A. Y. & Russell, S. J. (2000). 
Algorithms for inverse reinforcement learning. In Proceedings of the seventeenth international conference on machine learning (pp. 663\u2013670)."},{"issue":"6","key":"3141_CR43","doi-asserted-by":"publisher","first-page":"419","DOI":"10.1037\/h0058775","volume":"47","author":"J Olds","year":"1954","unstructured":"Olds, J., & Milner, P. (1954). Positive reinforcement produced by electrical stimulation of septal area and other regions of rat brain. Journal of Comparative and Physiological Psychology, 47(6), 419\u2013427.","journal-title":"Journal of Comparative and Physiological Psychology"},{"key":"3141_CR44","unstructured":"Omohundro, S. M. (2008). The basic AI drives. In P. Wang, B. Goertzel, & S. Franklin (Eds.)Artificial general intelligence (Vol. 171, pp. 483\u2013493). IOS Press."},{"key":"3141_CR45","unstructured":"Orseau, L. & Armstrong, S. (2016). Safely interruptible agents. In 32nd conference on uncertainty in artificial intelligence."},{"key":"3141_CR46","doi-asserted-by":"crossref","unstructured":"Orseau, L. & Ring, M. (2011). Self-modification and mortality in artificial agents. In Artificial general intelligence (Vol. 6830, pp. 1\u201310)","DOI":"10.1007\/978-3-642-22887-2_1"},{"key":"3141_CR47","doi-asserted-by":"crossref","unstructured":"Pearl, J. (2009). Causality: Models, reasoning, and inference (2nd edn). Cambridge University Press. ISBN: 9780521895606.","DOI":"10.1017\/CBO9780511803161"},{"key":"3141_CR48","doi-asserted-by":"crossref","unstructured":"Petersen, S. (2021). Machines learning values. In Ethics of artificial intelligence. Oxford University Press.","DOI":"10.1093\/oso\/9780190905033.003.0015"},{"key":"3141_CR49","doi-asserted-by":"crossref","unstructured":"Portenoy, R. K, Jarden, J. O., Sidtis, J. J., Lipton, R. B., Foley, K. M., & Rottenberg, D. A. (1986). Compulsive thalamic self-stimulation: A case with metabolic, electrophysiologic and behavioral correlates. In Pain. 
27.3.","DOI":"10.1016\/0304-3959(86)90155-7"},{"key":"3141_CR50","unstructured":"Reddy, S., Dragan, A. D., Levine, S., Legg, S., & Leike, J. (2020). Learning human objectives by evaluating hypothetical behavior. In ICML. arXiv: 1912.05652."},{"key":"3141_CR51","doi-asserted-by":"crossref","unstructured":"Ring, M., & Orseau, L. (2011). Delusion, survival, and intelligent agents. In Artificial general intelligence (pp. 1\u201311). Springer.","DOI":"10.1007\/978-3-642-22887-2_2"},{"key":"3141_CR52","unstructured":"Russell, S. J (2019). Stuart J. Russell on filter bubbles and the future of artificial intelligence. https:\/\/www.youtube.com\/watch?v=ZkV7anCPfaY (visited on 06\/15\/2020)."},{"key":"3141_CR53","unstructured":"Schmidhuber, J. (2007). G\u00f6del machines: Self-referential universal problem solvers making provably optimal self-improvements. In Artificial general intelligence. Springer. arXiv: 0309048 [cs]."},{"key":"3141_CR54","doi-asserted-by":"crossref","unstructured":"Schrittwieser, J., Antonoglou, I., Hubert, T., Simonyan, K., Sifre, L., Schmitt, S., Guez, A., Lockhart, E., Hassabis, D., Graepel, T., Lillicrap, T., & Silver, D. (2019). Mastering atari, go, chess and shogi by planning with a learned model.arXiv: 1911.08265.","DOI":"10.1038\/s41586-020-03051-4"},{"key":"3141_CR55","unstructured":"Shah, R., Krasheninnikov, D., Alexander, J., Abbeel, P., & Dragan, A. D. (2019). Preferences implicit in the state of the world. In 7th international conference on learning representations, ICLR. arXiv: 1902.04198."},{"key":"3141_CR56","unstructured":"Shpitser, I. & Pearl , J. (2007). What counterfactuals can be tested. In Proceedings of the 23rd conference on uncertainty in artificial intelligence (pp. 352\u2013359)."},{"key":"3141_CR57","first-page":"1941","volume":"9","author":"I Shpitser","year":"2008","unstructured":"Shpitser, I., & Pearl, J. (2008). Complete identification methods for the causal hierarchy. 
Journal of Machine Learning Research, 9, 1941\u20131979.","journal-title":"Journal of Machine Learning Research"},{"key":"3141_CR58","unstructured":"Soares, N., Fallenstein, B., Yudkowsky, E. S. & Armstrong, S. (2015). Corrigibility. In AAAI workshop on AI and ethics (pp. 74\u201382)."},{"key":"3141_CR59","unstructured":"Sutton, R. S. & Barto , A. G. (2018). Reinforcement learning: An introduction (2nd edn). MIT Press. ISBN: 9780262039246."},{"key":"3141_CR60","doi-asserted-by":"crossref","unstructured":"Turner, A. M., Hadfield-Menell, D., & Tadepalli, P. (2020). Conservative agency via attainable utility preservation. In AI, ethics, and society. arXiv: 1902.09725.","DOI":"10.1145\/3375627.3375851"},{"key":"3141_CR61","unstructured":"Uesato, J., Kumar, R., Krakovna, V., Everitt, T., Ngo, R., Legg, S. (2020). Avoiding tampering incentives in deep RL via decoupled approval. arXiv: 2011.08827."},{"key":"3141_CR62","unstructured":"Vaughanbell. (2008). Erotic self-stimulation and brain implants. https:\/\/mindhacks.com\/2008\/09\/16\/erotic-self-stimulation-and-brain-implants\/ (visited on 02\/08\/2018)."},{"key":"3141_CR63","doi-asserted-by":"crossref","unstructured":"Yampolskiy, R. V. (2015). Artificial superintelligence: A futuristic approach (p. 227). Chapman and Hall\/CRC. ISBN: 978-1482234435.","DOI":"10.1201\/b18612"},{"key":"3141_CR64","unstructured":"Yudkowsky, E. S. (2008). Hard takeoff. 
http:\/\/lesswrong.com\/lw\/wf\/hard%7B%5C%7Dtakeoff\/ (visited on 01\/12\/2018)."}],"container-title":["Synthese"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11229-021-03141-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11229-021-03141-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11229-021-03141-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,11,11]],"date-time":"2021-11-11T11:38:00Z","timestamp":1636630680000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11229-021-03141-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,5,19]]},"references-count":64,"journal-issue":{"issue":"S27","published-print":{"date-parts":[[2021,11]]}},"alternative-id":["3141"],"URL":"https:\/\/doi.org\/10.1007\/s11229-021-03141-4","relation":{},"ISSN":["0039-7857","1573-0964"],"issn-type":[{"value":"0039-7857","type":"print"},{"value":"1573-0964","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,5,19]]},"assertion":[{"value":"31 March 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 March 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 May 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}