{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T21:00:35Z","timestamp":1774904435320,"version":"3.50.1"},"reference-count":48,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61976215"],"award-info":[{"award-number":["61976215"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176259"],"award-info":[{"award-number":["62176259"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Cybern."],"published-print":{"date-parts":[[2023,10]]},"DOI":"10.1109\/tcyb.2022.3170355","type":"journal-article","created":{"date-parts":[[2022,5,13]],"date-time":"2022-05-13T19:30:45Z","timestamp":1652470245000},"page":"6421-6432","source":"Crossref","is-referenced-by-count":9,"title":["Anti-Martingale Proximal Policy Optimization"],"prefix":"10.1109","volume":"53","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6698-479X","authenticated-orcid":false,"given":"Yang","family":"Gu","sequence":"first","affiliation":[{"name":"Engineering Research Center of Intelligent Control for Underground Space, Ministry of Education, and the School of Information and Control Engineering, China University of Mining and Technology, Xuzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2022-9999","authenticated-orcid":false,"given":"Yuhu","family":"Cheng","sequence":"additional","affiliation":[{"name":"Engineering Research Center of Intelligent Control for Underground Space, Ministry of Education, and the School of Information and Control Engineering, China University of Mining and Technology, Xuzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7587-8027","authenticated-orcid":false,"given":"Kun","family":"Yu","sequence":"additional","affiliation":[{"name":"Engineering Research Center of Intelligent Control for Underground Space, Ministry of Education, and the School of Information and Control Engineering, China University of Mining and Technology, Xuzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5327-1088","authenticated-orcid":false,"given":"Xuesong","family":"Wang","sequence":"additional","affiliation":[{"name":"Engineering Research Center of Intelligent Control for Underground Space, Ministry of Education, and the School of Information and Control Engineering, China University of Mining and Technology, Xuzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref13","first-page":"1746","article-title":"Transfer from multiple MDPs","author":"lazaric","year":"2011","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2018.2821369"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390225"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2899594"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCPS48487.2020.00015"},{"key":"ref37","first-page":"237","article-title":"A connection between controlled Markov chains and martingales","volume":"9","author":"mandl","year":"1973","journal-title":"Kybernetika"},{"key":"ref14","first-page":"7881","article-title":"Important weighted transfer of samples in reinforcement learning","author":"tirinzoni","year":"2018","journal-title":"Proc 35th Int Conf Mach Learn"},{"key":"ref36","first-page":"1","article-title":"PPO-CMA: proximal policy optimization with covariance matrix adaptation","author":"h\u00e4m\u00e4l\u00e4inen","year":"2020","journal-title":"Proc IEEE 30th Int Workshop Mach Learn Signal Process"},{"key":"ref31","article-title":"Inference-based posterior distribution optimization","author":"wang","year":"2020","journal-title":"IEEE Trans Cybern"},{"key":"ref30","first-page":"1","article-title":"Dueling network architectures for deep reinforcement learning","author":"wang","year":"2016","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref11","article-title":"Imagination-augmented agents for deep reinforcement learning","author":"racani\u00e8re","year":"2018","journal-title":"arXiv 1707 06203"},{"key":"ref33","article-title":"Authentic boundary proximal policy optimization","author":"cheng","year":"2021","journal-title":"IEEE Trans Cybern"},{"key":"ref10","first-page":"1","article-title":"Temporal difference models: Model-free deep RL for model-based control","author":"pong","year":"2018","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref32","article-title":"Proximal policy optimization with policy feedback","author":"gu","year":"2021","journal-title":"IEEE Trans Syst Man Cybern Syst"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CIG.2018.8490422"},{"key":"ref1","doi-asserted-by":"crossref","first-page":"529","DOI":"10.1038\/nature14236","article-title":"Human-level control through deep reinforcement learning","volume":"518","author":"mnih","year":"2015","journal-title":"Nature"},{"key":"ref17","article-title":"Noisy networks for exploration","author":"fortunato","year":"2018","journal-title":"arXiv 1706 10295"},{"key":"ref39","first-page":"589","article-title":"Learning rates for Q-learning","volume":"5","author":"evendar","year":"2003","journal-title":"J Mach Learn Res"},{"key":"ref16","article-title":"Model-agnostic meta-learning for fast adaptation of deep networks","author":"finn","year":"2017","journal-title":"arXiv 1703 03400"},{"key":"ref38","first-page":"191","article-title":"Discrete-time Markov control processes with discounted unbounded costs: Optimality criteria","volume":"28","author":"hern\u00e1ndez-lerma","year":"1992","journal-title":"Kybernetika"},{"key":"ref19","article-title":"Parameter space noise for exploration","author":"plappert","year":"2018","journal-title":"arXiv 1706 01905"},{"key":"ref18","article-title":"NROWAN-DQN: A stable noisy network with noise reduction and online weight adjustment for exploration","author":"han","year":"2020","journal-title":"arXiv 2006 10980"},{"key":"ref24","article-title":"Prioritized experience replay","author":"schaul","year":"2016","journal-title":"arXiv 1511 05952"},{"key":"ref46","article-title":"Deep exploration via bootstrapped DQN","author":"osband","year":"2016","journal-title":"arXiv 1602 04621"},{"key":"ref23","article-title":"On learning intrinsic rewards for policy gradient methods","author":"zheng","year":"2018","journal-title":"arXiv 1804 06459"},{"key":"ref45","first-page":"1","article-title":"Sample-efficient deep reinforcement learning via episodic backward update","author":"lee","year":"2019","journal-title":"Proc Conf Workshop Neural Inf Process Syst"},{"key":"ref26","article-title":"Deep reinforcement learning with quantum-inspired experience replay","author":"wei","year":"2021","journal-title":"IEEE Trans Cybern"},{"key":"ref48","author":"dhariwal","year":"2017","journal-title":"OpenAI Baselines"},{"key":"ref25","article-title":"Distributed prioritized experience replay","author":"horgan","year":"2018","journal-title":"arXiv 1803 00933"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-29946-9_12"},{"key":"ref20","first-page":"1","article-title":"Intrinsically motivated reinforcement learning","author":"singh","year":"2004","journal-title":"Proc Conf Workshop Neural Inf Process Syst"},{"key":"ref42","article-title":"Provably efficient exploration in policy optimization","author":"cai","year":"2019","journal-title":"arXiv 912 05830v1"},{"key":"ref41","article-title":"A Lyapunov-based approach to safe reinforcement learning","author":"chow","year":"2018","journal-title":"arXiv 1805 07708"},{"key":"ref22","article-title":"Intrinsic reward driven imitation learning via generative model","author":"yu","year":"2020","journal-title":"arXiv 2006 15061"},{"key":"ref44","article-title":"High-dimensional continuous control using generalized advantage estimation","author":"schulman","year":"2015","journal-title":"arXiv 1506 02438 [cs]"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.70"},{"key":"ref43","article-title":"Proximal policy optimization algorithms","author":"schulman","year":"2017","journal-title":"arXiv 1707 06347"},{"key":"ref28","first-page":"1","article-title":"Hindsight experience replay","author":"andrychowicz","year":"2017","journal-title":"Proc Conf Workshop Neural Inf Process Syst"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ADPRL.2014.7010631"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11595"},{"key":"ref8","first-page":"7444","article-title":"SOLAR: deep structured representations for model-based reinforcement learning","author":"zhang","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/820"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2014.2341582"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/SSCI.2018.8628631"},{"key":"ref3","article-title":"Creating pro-level AI for a real-time fighting game using deep reinforcement learning","author":"oh","year":"2021","journal-title":"IEEE Trans Games"},{"key":"ref6","article-title":"Network-scale traffic signal control via multiagent reinforcement learning with deep spatiotemporal attentive network","author":"huang","year":"2021","journal-title":"IEEE Trans Cybern"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2927869"},{"key":"ref40","article-title":"Reinforcement learning in signaling game","author":"hu","year":"2011","journal-title":"arXiv 1103 5818"}],"container-title":["IEEE Transactions on Cybernetics"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6221036\/10252107\/09774969.pdf?arnumber=9774969","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,2]],"date-time":"2023-10-02T18:02:24Z","timestamp":1696269744000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9774969\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10]]},"references-count":48,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tcyb.2022.3170355","relation":{},"ISSN":["2168-2267","2168-2275"],"issn-type":[{"value":"2168-2267","type":"print"},{"value":"2168-2275","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10]]}}}