{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T18:33:43Z","timestamp":1776278023458,"version":"3.50.1"},"reference-count":257,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372329"],"award-info":[{"award-number":["62372329"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1109\/tpami.2024.3457538","type":"journal-article","created":{"date-parts":[[2024,9,10]],"date-time":"2024-09-10T19:09:35Z","timestamp":1725995375000},"page":"11216-11235","source":"Crossref","is-referenced-by-count":189,"title":["A Review of Safe Reinforcement Learning: Methods, Theories, and Applications"],"prefix":"10.1109","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2722-3779","authenticated-orcid":false,"given":"Shangding","family":"Gu","sequence":"first","affiliation":[{"name":"University of California, Berkeley, CA, USA"}]},{"given":"Long","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute for AI, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5683-2621","authenticated-orcid":false,"given":"Yali","family":"Du","sequence":"additional","affiliation":[{"name":"Department of Informatics, King&#x2019;s College London, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7416-592X","authenticated-orcid":false,"given":"Guang","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tongji University, Shanghai, China"}]},{"given":"Florian","family":"Walter","sequence":"additional","affiliation":[{"name":"Department of Informatics, Technical University of Munich, Munich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4021-4228","authenticated-orcid":false,"given":"Jun","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University College London, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4840-076X","authenticated-orcid":false,"given":"Alois","family":"Knoll","sequence":"additional","affiliation":[{"name":"Department of Informatics, Technical University of Munich, Munich, Germany"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.tre.2021.102496"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i1.16156"},{"key":"ref3","first-page":"22","article-title":"Constrained policy optimization","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Achiam"},{"key":"ref4","volume-title":"Oxford Dictionary of English","author":"Stevenson","year":"2010"},{"key":"ref5","first-page":"143","article-title":"Safe exploration for reinforcement learning","volume-title":"Proc. 16th Eur. Symp. Artif. Neural Netw.","author":"Hans"},{"key":"ref6","first-page":"3916","article-title":"Cooperative inverse reinforcement learning","volume-title":"Proc. 30th Int. Conf. Neural Inf. Process. Syst.","author":"Hadfield-Menell"},{"key":"ref7","article-title":"AI safety via debate","author":"Irving","year":"2018"},{"key":"ref8","article-title":"Scalable agent alignment via reward modeling: A research direction","author":"Leike","year":"2018"},{"key":"ref9","article-title":"Concrete problems in AI safety","author":"Amodei","year":"2016"},{"key":"ref10","first-page":"177","article-title":"Mean-variance optimization in Markov decision processes","volume-title":"Proc. 28th Int. Conf. Mach. Learn.","author":"Mannor"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/1273496.1273525"},{"key":"ref12","first-page":"3140","article-title":"Risk aversion in Markov decision processes via near optimal Chernoff bounds","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Moldovan"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-021-05961-4"},{"key":"ref14","article-title":"A primal-dual approach to constrained Markov decision processes","author":"Chen","year":"2021"},{"key":"ref15","first-page":"3703","article-title":"Batch policy learning under constraints","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Le"},{"key":"ref16","article-title":"Accelerated primal-dual policy optimization for safe reinforcement learning","author":"Liang","year":"2018"},{"key":"ref17","first-page":"7553","article-title":"Constrained reinforcement learning has zero duality gap","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Paternain"},{"key":"ref18","article-title":"Robust constrained-MDPs: Soft-constrained robust policy optimization under model uncertainty","author":"Russel","year":"2020"},{"key":"ref19","first-page":"8502","article-title":"Constrained Markov decision processes via backward value functions","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Satija"},{"key":"ref20","article-title":"Reward constrained policy optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Tessler"},{"key":"ref21","article-title":"A primal approach to constrained policy optimization: Global optimality and finite-time analysis","author":"Xu","year":"2020"},{"key":"ref22","article-title":"Conservative safety critics for exploration","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Bharadhwaj"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/632"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1016\/j.automatica.2021.109689"},{"key":"ref25","article-title":"Supervised policy update for deep reinforcement learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Vuong"},{"key":"ref26","article-title":"Projection-based constrained policy optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yang"},{"key":"ref27","first-page":"6172","article-title":"Neurosymbolic reinforcement learning with formally verified exploration","volume-title":"Proc. Int. Conf. Neural Inf. Process. 
Syst.","author":"Anderson"},{"key":"ref28","article-title":"Safety verification of autonomous systems: A multi-fidelity reinforcement learning approach","author":"Beard","year":"2022"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12107"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3447928.3456653"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.5220\/0010258102370245"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3314221.3314638"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2020.XVI.088"},{"key":"ref34","first-page":"8103","article-title":"A Lyapunov-based approach to safe reinforcement learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Chow"},{"key":"ref35","article-title":"Lyapunov-based safe policy optimization for continuous control","volume-title":"Proc. Int. Conf. Mach. Learn. Workshop RL4RealLife","author":"Chow"},{"key":"ref36","article-title":"Safe reinforcement learning for probabilistic reachability and safety specifications: A Lyapunov-based approach","author":"Huh","year":"2020"},{"key":"ref37","article-title":"Lyapunov-based uncertainty-aware safe reinforcement learning","author":"Jeddi","year":"2021"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CDC.2014.7039601"},{"key":"ref39","article-title":"Safe-critical modular deep reinforcement learning with temporal logic through Gaussian processes and control barrier functions","author":"Cai","year":"2021"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-021-06103-6"},{"key":"ref41","article-title":"Safety-guided deep reinforcement learning via online Gaussian process estimation","author":"Fan","year":"2019"},{"key":"ref42","first-page":"1565","article-title":"Safe policy search using Gaussian process models","volume-title":"Proc. 18th Int. Conf. Auton. Agents MultiAgent Syst.","author":"Polymenakos"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1016\/j.ifacol.2021.08.292"},{"key":"ref44","first-page":"997","article-title":"Safe exploration for optimization with Gaussian processes","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sui"},{"key":"ref45","first-page":"4305","article-title":"Safe exploration in finite Markov decision processes with Gaussian processes","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Turchetta"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12103"},{"key":"ref47","volume-title":"Reinforcement Learning: An Introduction","author":"Sutton","year":"2018"},{"key":"ref48","first-page":"3304","article-title":"Provably efficient safe exploration via primal-dual policy optimization","volume-title":"Proc. Int. Conf. Artif. Intell. Stat.","author":"Ding"},{"key":"ref49","first-page":"22288","article-title":"Nearly minimax optimal reinforcement learning for discounted MDPs","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"He"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i9.16979"},{"key":"ref51","article-title":"A simple reward-free approach to constrained reinforcement learning","author":"Miryoosefi","year":"2021"},{"key":"ref52","article-title":"Natural policy gradient primal-dual method for constrained Markov decision processes","volume-title":"Proc. Int. Conf. Neural Inf. Process. 
Syst.","author":"Ding"},{"key":"ref53","first-page":"11 480","article-title":"CRPO: A new approach for safe reinforcement learning with convergence guarantee","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Xu"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/cdc51059.2022.9992837"},{"key":"ref55","article-title":"Unsolved problems in ML safety","author":"Hendrycks","year":"2021"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.3390\/robotics11040081"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2018.8593420"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC45102.2020.9294259"},{"key":"ref59","article-title":"Deep constrained Q-learning","author":"Kalweit","year":"2020"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793742"},{"key":"ref61","first-page":"32","article-title":"Reinforcement learning for autonomous maneuvering in highway scenarios","volume-title":"Proc. Workshop Driving Assistance Syst. Auton. Driving","author":"Mirchevska"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC.2018.8569448"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3152313"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3761"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2019.103360"},{"issue":"Dec.","key":"ref66","first-page":"803","article-title":"Lyapunov design for safe reinforcement learning","volume":"3","author":"Perkins","year":"2002","journal-title":"J. Mach. Learn. Res."},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8460547"},{"key":"ref68","article-title":"SAFER: Data-efficient and safe reinforcement learning via skill acquisition","author":"Slack","year":"2022"},{"key":"ref69","first-page":"13859","article-title":"Safe reinforcement learning by imagining the near future","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Thomas"},{"key":"ref70","article-title":"MuZero with self-competition for rate control in VP9 video compression","author":"Mandhane","year":"2022"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/TSG.2019.2955437"},{"key":"ref72","article-title":"AI safety gridworlds","author":"Leike","year":"2017"},{"key":"ref73","article-title":"Benchmarking safe exploration in deep reinforcement learning","author":"Ray","year":"2019"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1016\/j.artint.2023.103905"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.5840\/monist197659224"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3457538"},{"key":"ref77","volume-title":"Constrained Markov Decision Processes","author":"Altman","year":"1999"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1016\/0022-247X(85)90288-4"},{"key":"ref79","article-title":"Linear programming and finite Markovian control problems","volume":"148","author":"Kallenberg","year":"1983","journal-title":"Math. 
Centre Tract"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1287\/opre.37.3.474"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1287\/opre.37.5.780"},{"key":"ref82","volume-title":"Markov Decision Processes: Discrete Stochastic Dynamic Programming","author":"Puterman","year":"2014"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1146\/annurev-control-042920-020211"},{"issue":"1","key":"ref84","first-page":"1437","article-title":"A comprehensive survey on safe reinforcement learning","volume":"16","author":"Garc\u0131a","year":"2015","journal-title":"J. Mach. Learn. Res."},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-73959-1_12"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/614"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CDC.2018.8619572"},{"key":"ref88","first-page":"826","article-title":"L1-GP: L1 adaptive control with Bayesian learning","volume-title":"Proc. Conf. Learn. Dyn. Control","author":"Gahlawat"},{"key":"ref89","first-page":"324","article-title":"Probabilistic robust linear quadratic regulators with Gaussian processes","volume-title":"Proc. Conf. Learn. Dyn. Control","author":"von Rohr"},{"key":"ref90","first-page":"444","article-title":"Regret-based reward elicitation for Markov decision processes","volume-title":"Proc. 25th Conf. Uncertain Artif. Intell.","author":"Regan"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1287\/moor.1080.0324"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1287\/moor.27.2.294.324"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-335-6.50021-0"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1287\/mnsc.18.7.356"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1016\/j.camwa.2005.11.013"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793611"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1287\/opre.1050.0216"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1527\/tjsai.16.353"},{"key":"ref99","first-page":"1651","article-title":"Policy gradients with variance related risk criteria","volume-title":"Proc. 29th Int. Conf. Mach. Learn.","author":"Tamar"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1177\/0278364910371999"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-247-2.50017-6"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1007\/s10846-013-9826-6"},{"key":"ref103","first-page":"1451","article-title":"Safe exploration in Markov decision processes","volume-title":"Proc. 29th Int. Conf. Mach. Learn.","author":"Moldovan"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/ROBOT.2010.5509832"},{"key":"ref105","first-page":"1000","article-title":"Reinforcement learning with human teachers: Evidence of feedback and guidance with implications for learning performance","volume-title":"Proc. 21st Int. Conf. Artif. Intell.","author":"Thomaz"},{"key":"ref106","article-title":"A CMDP-within-online framework for meta-safe reinforcement learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Khattar"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i4.20281"},{"key":"ref108","article-title":"Exploration-exploitation in constrained MDPs","author":"Efroni","year":"2020"},{"key":"ref109","first-page":"17183","article-title":"Learning policies with zero or bounded constraint violation for constrained MDPs","volume-title":"Proc. Int. Conf. Neural Inf. Process. 
Syst.","author":"Liu"},{"key":"ref110","article-title":"A provably-efficient model-free algorithm for constrained Markov decision processes","author":"Wei","year":"2021"},{"key":"ref111","article-title":"Reward constrained policy optimization","author":"Tessler","year":"2018"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysconle.2004.08.007"},{"key":"ref113","volume-title":"Microeconomic Theory","volume":"1","author":"Mas-Colell","year":"1995"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1111\/1468-0262.00296"},{"key":"ref115","first-page":"3121","article-title":"Convergent policy optimization for safe reinforcement learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yu"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2019.2925601"},{"issue":"167","key":"ref117","first-page":"1","article-title":"Risk-constrained reinforcement learning with percentile risk criteria","volume":"18","author":"Chow","year":"2018","journal-title":"J. Mach. Learn. Res."},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1515\/mcma.2009.011"},{"key":"ref119","article-title":"Policy gradients beyond expectations: Conditional value-at-risk","author":"Tamar","year":"2014"},{"key":"ref120","article-title":"First order constrained optimization in policy space","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhang"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1016\/0893-6080(89)90020-8"},{"key":"ref122","first-page":"908","article-title":"Safe model-based reinforcement learning with stability guarantees","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"Berkenkamp"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.2020.3024161"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1016\/j.automatica.2013.02.003"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1016\/S0005-1098(99)00214-9"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.2018.2876389"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636468"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2022.3216996"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1016\/j.automatica.2022.110684"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1002\/rnc.5132"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/CDC40024.2019.9029886"},{"key":"ref132","first-page":"1861","article-title":"Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Haarnoja"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.2015.2444131"},{"key":"ref134","volume-title":"Gaussian Processes for Machine Learning","volume":"2","author":"Williams","year":"2006"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/ECC.2015.7330913"},{"key":"ref136","first-page":"465","article-title":"PILCO: A model-based and data-efficient approach to policy search","volume-title":"Proc. 28th Int. Conf. Mach. Learn.","author":"Deisenroth"},{"key":"ref137","first-page":"1889","article-title":"Trust region policy optimization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Schulman"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v24i1.7727"},{"key":"ref139","first-page":"10630","article-title":"Safe reinforcement learning using advantage-based intervention","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Wagener"},{"key":"ref140","first-page":"9780","article-title":"Shortest-path constrained reinforcement learning for sparse reward tasks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sohn"},{"key":"ref141","first-page":"9797","article-title":"Safe reinforcement learning in constrained Markov decision processes","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wachi"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5932"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511804441"},{"key":"ref144","first-page":"1522","article-title":"Risk-sensitive and robust decision-making: A CVaR optimization approach","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Chow"},{"key":"ref145","article-title":"Constrained episodic reinforcement learning in concave-convex and knapsack settings","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Brantley"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.2140\/pjm.1956.6.1"},{"key":"ref147","first-page":"14070","article-title":"Reinforcement learning with convex constraints","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Miryoosefi"},{"key":"ref148","article-title":"Safe policies for reinforcement learning via primal-dual methods","author":"Paternain","year":"2019"},{"key":"ref149","first-page":"6471","article-title":"First-order methods almost always avoid saddle points: The case of vanishing step-sizes","volume-title":"Proc. Neural Inf. Process. Syst.","author":"Panageas"},{"key":"ref150","first-page":"13 303","article-title":"Provably efficient model-free constrained RL with linear function approximation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ghosh"},{"key":"ref151","first-page":"2137","article-title":"Provably efficient reinforcement learning with linear function approximation","volume-title":"Proc. 33rd Conf. Learn. Theory","author":"Jin"},{"key":"ref152","first-page":"54341","article-title":"Provably safe reinforcement learning with step-wise violation constraints","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Xiong"},{"key":"ref153","first-page":"263","article-title":"Minimax regret bounds for reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Azar"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3238656"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/CDC45484.2021.9683573"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.23919\/ACC45564.2020.9147584"},{"key":"ref157","article-title":"Constrained reinforcement learning with smoothed log barrier function","author":"Zhang","year":"2024"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013387"},{"key":"ref159","first-page":"1970","article-title":"Safe control under input limits with neural control barrier functions","volume-title":"Proc. 6th Conf. Robot Learn.","volume":"205","author":"Liu"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/LCSYS.2022.3229865"},{"key":"ref161","first-page":"8682","article-title":"Density constrained reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Qin"},{"key":"ref162","article-title":"Boosting the actor with dual critic","volume-title":"Proc. Int. Conf. Learn. 
Representations","author":"Dai"},{"key":"ref163","first-page":"2315","article-title":"DualDICE: Behavior-agnostic estimation of discounted stationary distribution corrections","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Nachum"},{"key":"ref164","article-title":"Reinforcement learning via Fenchel-Rockafellar duality","author":"Nachum","year":"2020"},{"key":"ref165","article-title":"Doubly robust bias reduction in infinite horizon off-policy estimation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Tang"},{"key":"ref166","article-title":"Safe exploration in continuous action spaces","author":"Dalal","year":"2018"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.23919\/ACC45564.2020.9147265"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.2023.3319070"},{"key":"ref169","first-page":"483","article-title":"Cautious reinforcement learning with logical constraints","volume-title":"Proc. 19th Int. Conf. Auton. Agents Multiagent Syst.","author":"Hasanbeig"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20652-9_22"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10575-8"},{"key":"ref172","article-title":"Towards verifiable and safe model-free reinforcement learning","volume-title":"Proc. CEUR Workshop","author":"Hasanbeig"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-41540-6_17"},{"key":"ref174","first-page":"535","article-title":"An algorithm for distributed reinforcement learning in cooperative multi-agent systems","volume-title":"Proc. 17th Int. Conf. Mach. Learn.","author":"Lauer"},{"key":"ref175","article-title":"Constrained MDPs and the reward hypothesis","author":"Szepesv\u00e1ri","year":"2020"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1287\/moor.25.1.130.15210"},{"key":"ref177","first-page":"66138","article-title":"Last-iterate convergent policy gradient primal-dual methods for constrained MDPs","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Ding"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/CDC.2002.1184811"},{"key":"ref179","volume-title":"Stochastic Approximation: A Dynamical Systems Viewpoint","volume":"48","author":"Borkar","year":"2009"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177729586"},{"key":"ref181","article-title":"Asymptopia: An exposition of statistical asymptotic theory. 2000","author":"Pollard","year":"2000"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1016\/j.automatica.2004.12.003"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/414"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1007\/s10957-012-9989-5"},{"key":"ref185","first-page":"3110","article-title":"Near-optimal sample complexity bounds for constrained MDPs","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Vaswani"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/CDC.2002.1184811"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1016\/j.orl.2024.107107"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/347"},{"key":"ref189","volume-title":"Convex Optimization Algorithms","author":"Bertsekas","year":"2015"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/ACC.2016.7526658"},{"key":"ref191","first-page":"197","article-title":"Multi-criteria reinforcement learning","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"G\u00e1bor"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17271"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511801181"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-0348-0439-4_14"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-34106-9_26"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-013-5368-1"},{"key":"ref197","article-title":"Constrained Markov decision processes","author":"Altman","year":"1995"},{"key":"ref198","first-page":"9111","article-title":"Constrained update projection approach to safe policy optimization","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref199","article-title":"Model-free, regret-optimal best policy identification in online CMDPs","author":"Zhou","year":"2023"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1994.6.2.215"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1145\/203330.203343"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007518724497"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/9.793723"},{"key":"ref204","first-page":"305","article-title":"ALVINN: An autonomous land vehicle in a neural network","volume-title":"Proc. 1st Int. Conf. Neural Inf. Process. Syst.","author":"Pomerleau"},{"issue":"02","key":"ref205","first-page":"52","article-title":"Path search of unmanned surface vehicle based on topological location","volume":"42","author":"Gu","year":"2019","journal-title":"Navigation China"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1016\/j.oceaneng.2019.106798"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.3390\/jmse10030420"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1177\/1475090219898566"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1016\/j.oceaneng.2020.107043"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC45102.2020.9294262"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.23919\/ECC.2019.8796030"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1016\/j.trc.2022.103656"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1016\/0191-2615(81)90037-0"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1109\/TCIAIG.2012.2186810"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992698"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-006-6226-1"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/IVS.2017.7995918"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636847"},{"key":"ref219","first-page":"1278","article-title":"Stochastic backpropagation and approximate inference in deep generative models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Rezende"},{"key":"ref220","article-title":"Continuous control with deep reinforcement learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lillicrap"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC.2018.8569938"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC.2018.8569552"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/access.2024.3431437"},{"key":"ref224","first-page":"849","article-title":"Policy search for motor primitives in robotics","volume-title":"Proc. 22nd Annu. Conf. Neural Inf. Process. 
Syst.","author":"Kober"},{"key":"ref225","article-title":"Policy representation via diffusion probability model for reinforcement learning","author":"Yang","year":"2023"},{"key":"ref226","article-title":"PARROT: Data-driven behavioral priors for reinforcement learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Singh"},{"key":"ref227","article-title":"Pybullet, a python module for physics simulation for games, robotics and machine learning","author":"Coumans","year":"2016"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.1959.1104895"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4471-0967-9"},{"key":"ref230","first-page":"1357","article-title":"Robot reinforcement learning on the constraint manifold","volume-title":"Proc. Conf. Robot Learn.","author":"Liu"},{"key":"ref231","article-title":"Building healthy recommendation sequences for everyone: A safe reinforcement learning approach","volume-title":"Proc. FAccTRec Workshop","author":"Singh"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/TCOMM.2021.3087787"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2022.3149396"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.2514\/1.I011126"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/TCOMM.2020.3007742"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v29i1.9561"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-020-03051-4"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2019.8901772"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1007\/s10458-019-09430-0"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.10.013"},{"key":"ref241","article-title":"OpenAI gym","author":"Brockman","year":"2016"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6386109"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2022.3196132"},{"key":"ref244","article-title":"Optimal and learning control for autonomous robots","author":"Buchli","year":"2017"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/TCST.2019.2949757"},{"key":"ref246","first-page":"1856","article-title":"Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Haarnoja"},{"key":"ref247","article-title":"Proximal policy optimization algorithms","author":"Schulman","year":"2017"},{"key":"ref248","article-title":"FACMAC: Factored multi-agent centralised policy gradients","author":"Peng","year":"2020"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9635984"},{"key":"ref250","article-title":"Robosuite: A modular simulation framework and benchmark for robot learning","author":"Zhu","year":"2020"},{"key":"ref251","article-title":"Isaac gym: High performance GPU based physics simulation for robot learning","volume-title":"Proc. 35th Conf. Neural Inf. Process. Syst. Datasets Benchmarks Track","author":"Makoviychuk"},{"key":"ref252","article-title":"Multi-principal assistance games","author":"Fickinger","year":"2020"},{"key":"ref253","first-page":"3330","article-title":"Inequity aversion improves cooperation in intertemporal social dilemmas","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Hughes"},{"key":"ref254","first-page":"464","article-title":"Multi-agent reinforcement learning in sequential social dilemmas","volume-title":"Proc. 
16th Conf. Auton. Agents Multiagent Syst.","author":"Leibo"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.1093\/0199252866.003.0002"},{"key":"ref256","article-title":"Policy optimization for constrained MDPs with provable fast global convergence","author":"Liu","year":"2021"},{"key":"ref257","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60990-0_12"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/10746266\/10675394.pdf?arnumber=10675394","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:38:11Z","timestamp":1732667891000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10675394\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":257,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2024.3457538","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12]]}}}
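The record above is a single work message from the Crossref REST API (endpoint https://api.crossref.org/works/{doi}). As a minimal sketch of how such a record can be fetched and summarized, the Python snippet below retrieves the same DOI and prints the bibliographic essentials; it assumes network access and the third-party `requests` package, and every field name it reads is taken directly from the record itself.

```python
# Minimal sketch: fetch and summarize a Crossref work record.
# Assumes the third-party `requests` package is installed.
import requests

DOI = "10.1109/tpami.2024.3457538"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the payload sits under the "message" key

# "title" arrives as a list; authors are objects with "family" and
# (usually) "given" names, so read "given" defensively.
title = work["title"][0]
authors = ", ".join(
    f'{a.get("given", "")} {a["family"]}'.strip() for a in work["author"]
)

print(title)
print(authors)
print("References:", work["reference-count"])
print("Cited by:", work["is-referenced-by-count"])
```

Note that entries under `"reference"` are heterogeneous, matching the record above: some carry only a `"DOI"`, others only `"article-title"`/`"volume-title"` strings, so any consumer of the reference list should treat every field as optional.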