{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T10:15:02Z","timestamp":1772014502021,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":27,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,2,7]],"date-time":"2020-02-07T00:00:00Z","timestamp":1581033600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Long-Term Future Fund"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,2,7]]},"DOI":"10.1145\/3375627.3375851","type":"proceedings-article","created":{"date-parts":[[2020,2,5]],"date-time":"2020-02-05T01:10:22Z","timestamp":1580865022000},"page":"385-391","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["Conservative Agency via Attainable Utility Preservation"],"prefix":"10.1145","author":[{"given":"Alexander Matt","family":"Turner","sequence":"first","affiliation":[{"name":"Oregon State University, Corvallis, OR, USA"}]},{"given":"Dylan","family":"Hadfield-Menell","sequence":"additional","affiliation":[{"name":"University of California, Berkeley, Berkeley, CA, USA"}]},{"given":"Prasad","family":"Tadepalli","sequence":"additional","affiliation":[{"name":"Oregon State University, Corvallis, OR, USA"}]}],"member":"320","published-online":{"date-parts":[[2020,2,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Constrained Markov decision processes","author":"Altman Eitan","year":"1999","unstructured":"Eitan Altman. Constrained Markov decision processes, volume 7. CRC Press, 1999."},{"key":"e_1_3_2_1_2_1","volume-title":"June","author":"Amodei Dario","year":"2016","unstructured":"Dario Amodei, Chris Olah, Jacob Steinhardt, Paul Christiano, John Schulman, and Dan Mane. Concrete problems in AI safety. arXiv:1606.06565 [cs], June 2016. arXiv: 1606.06565."},{"key":"e_1_3_2_1_3_1","volume-title":"May","author":"Armstrong Stuart","year":"2017","unstructured":"Stuart Armstrong and Benjamin Levinstein. Low impact artificial intelligences. arXiv:1705.10720 [cs], May 2017. arXiv: 1705.10720."},{"key":"e_1_3_2_1_4_1","first-page":"908","volume-title":"Advances in Neural Information Processing Systems","author":"Berkenkamp Felix","year":"2017","unstructured":"Felix Berkenkamp, Matteo Turchetta, Angela Schoellig, and Andreas Krause. Safe model-based reinforcement learning with stability guarantees. In Advances in Neural Information Processing Systems, pages 908--918, 2017."},{"key":"e_1_3_2_1_5_1","volume-title":"Incorrigibility in the CIRL framework. AI, Ethics, and Society","author":"Carey Ryan","year":"2018","unstructured":"Ryan Carey. Incorrigibility in the CIRL framework. AI, Ethics, and Society, 2018."},{"key":"e_1_3_2_1_6_1","first-page":"8092","volume-title":"Advances in Neural Information Processing Systems","author":"Chow Yinlam","year":"2018","unstructured":"Yinlam Chow, Ofir Nachum, Edgar Duenez-Guzman, and Mohammad Ghavamzadeh. A lyapunov-based approach to safe reinforcement learning. In Advances in Neural Information Processing Systems, pages 8092--8101, 2018."},{"key":"e_1_3_2_1_7_1","first-page":"4299","volume-title":"Advances in Neural Information Processing Systems","author":"Christiano Paul F","year":"2017","unstructured":"Paul F Christiano, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and Dario Amodei. Deep reinforcement learning from human preferences. In Advances in Neural Information Processing Systems, pages 4299--4307, 2017."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/656"},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Learning Representations","author":"Eysenbach Benjamin","year":"2018","unstructured":"Benjamin Eysenbach, Shixiang Gu, Julian Ibarz, and Sergey Levine. Leave no trace: Learning to reset for safe and autonomous reinforcement learning. In International Conference on Learning Representations, 2018."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.5555\/2789272.2886795"},{"key":"e_1_3_2_1_11_1","first-page":"3909","volume-title":"Advances in Neural Information Processing Systems","author":"Hadfield-Menell Dylan","year":"2016","unstructured":"Dylan Hadfield-Menell, Stuart Russell, Pieter Abbeel, and Anca Dragan. Cooperative inverse reinforcement learning. In Advances in Neural Information Processing Systems, pages 3909--3917, 2016."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/32"},{"key":"e_1_3_2_1_13_1","first-page":"6765","volume-title":"Advances in Neural Information Processing Systems","author":"Hadfield-Menell Dylan","year":"2017","unstructured":"Dylan Hadfield-Menell, Smitha Milli, Pieter Abbeel, Stuart Russell, and Anca Dragan. Inverse reward design. In Advances in Neural Information Processing Systems, pages 6765--6774, 2017."},{"key":"e_1_3_2_1_14_1","volume-title":"June","author":"Krakovna Victoria","year":"2018","unstructured":"Victoria Krakovna, Laurent Orseau, Miljan Martic, and Shane Legg. Measuring and avoiding side effects using relative reachability. arXiv:1806.01186 [cs, stat], June 2018. arXiv: 1806.01186."},{"key":"e_1_3_2_1_15_1","volume-title":"Preventing side-effects in gridworlds","author":"Leech Gavin","year":"2018","unstructured":"Gavin Leech, Karol Kubicki, Jessica Cooper, and Tom McGrath. Preventing side-effects in gridworlds, 2018."},{"key":"e_1_3_2_1_16_1","volume-title":"November","author":"Leike Jan","year":"2017","unstructured":"Jan Leike, Miljan Martic, Victoria Krakovna, Pedro Ortega, Tom Everitt, Andrew Lefrancq, Laurent Orseau, and Shane Legg. AI safety gridworlds. arXiv:1711.09883 [cs], November 2017. arXiv: 1711.09883."},{"key":"e_1_3_2_1_17_1","first-page":"2125","volume-title":"Advances in Neural Information Processing Systems","author":"Mohamed Shakir","year":"2015","unstructured":"Shakir Mohamed and Danilo Jimenez Rezende. Variational information maximisation for intrinsically motivated reinforcement learning. In Advances in Neural Information Processing Systems, pages 2125--2133, 2015."},{"key":"e_1_3_2_1_18_1","volume-title":"ICML","author":"Moldovan Teodor Mihai","year":"2012","unstructured":"Teodor Mihai Moldovan and Pieter Abbeel. Safe exploration in Markov decision processes. ICML, 2012."},{"key":"e_1_3_2_1_19_1","volume-title":"https:\/\/blog.openai.com\/openai-five\/","author":"Five AI.","year":"2018","unstructured":"OpenAI. OpenAI Five. https:\/\/blog.openai.com\/openai-five\/, 2018."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-13823-7_31"},{"key":"e_1_3_2_1_21_1","volume-title":"AAAI","author":"Regan Kevin","year":"2010","unstructured":"Kevin Regan and Craig Boutilier. Robust policy computation in reward-uncertain MDPs using nondominated policies. In AAAI, 2010."},{"key":"e_1_3_2_1_22_1","first-page":"2067","volume-title":"Proceedings of the 17th International Conference on Autonomous Agents and Multi-Agent Systems","author":"Saunders William","year":"2018","unstructured":"William Saunders, Girish Sastry, Andreas Stuhlmueller, and Owain Evans. Trial without error: Towards safe reinforcement learning via human intervention. In Proceedings of the 17th International Conference on Autonomous Agents and Multi-Agent Systems, pages 2067--2069, 2018."},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Learning Representations","author":"Shah Rohin","year":"2019","unstructured":"Rohin Shah, Dmitrii Krasheninnikov, Jordan Alexander, Pieter Abbeel, and Anca Dragan. The implicit preference information in an initial state. In International Conference on Learning Representations, 2019."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1038\/nature16961"},{"key":"e_1_3_2_1_25_1","volume-title":"Eliezer Yudkowsky. Corrigibility. AAAI Workshops","author":"Soares Nate","year":"2015","unstructured":"Nate Soares, Benja Fallenstein, Stuart Armstrong, and Eliezer Yudkowsky. Corrigibility. AAAI Workshops, 2015."},{"key":"e_1_3_2_1_26_1","first-page":"292","author":"Peter Dayan ChristopherWatkins","year":"1992","unstructured":"ChristopherWatkins and Peter Dayan. Q-learning. Machine Learning, 8(3--4):279-- 292, 1992.","journal-title":"Q-learning. Machine Learning, 8(3--4):279--"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/676"}],"event":{"name":"AIES '20: AAAI\/ACM Conference on AI, Ethics, and Society","location":"New York NY USA","acronym":"AIES '20","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence"]},"container-title":["Proceedings of the AAAI\/ACM Conference on AI, Ethics, and Society"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3375627.3375851","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3375627.3375851","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T22:38:14Z","timestamp":1750199894000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3375627.3375851"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,2,7]]},"references-count":27,"alternative-id":["10.1145\/3375627.3375851","10.1145\/3375627"],"URL":"https:\/\/doi.org\/10.1145\/3375627.3375851","relation":{},"subject":[],"published":{"date-parts":[[2020,2,7]]},"assertion":[{"value":"2020-02-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}