{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T01:37:13Z","timestamp":1772156233908,"version":"3.50.1"},"reference-count":55,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102241"],"award-info":[{"award-number":["62102241"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007219","name":"Natural Science Foundation of Shanghai Municipality","doi-asserted-by":"publisher","award":["23ZR1425400"],"award-info":[{"award-number":["23ZR1425400"]}],"id":[{"id":"10.13039\/100007219","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Emerg. Top. Comput. Intell."],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1109\/tetci.2024.3369636","type":"journal-article","created":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T18:53:34Z","timestamp":1710269614000},"page":"2974-2986","source":"Crossref","is-referenced-by-count":6,"title":["Model-Based Off-Policy Deep Reinforcement Learning With Model-Embedding"],"prefix":"10.1109","volume":"8","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3555-7143","authenticated-orcid":false,"given":"Xiaoyu","family":"Tan","sequence":"first","affiliation":[{"name":"INF Technology Company, Ltd., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4246-9106","authenticated-orcid":false,"given":"Chao","family":"Qu","sequence":"additional","affiliation":[{"name":"INF Technology Company, Ltd., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2028-510X","authenticated-orcid":false,"given":"Junwu","family":"Xiong","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6519-676X","authenticated-orcid":false,"given":"James","family":"Zhang","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4024-925X","authenticated-orcid":false,"given":"Xihe","family":"Qiu","sequence":"additional","affiliation":[{"name":"School of Electronic and Electrical Engineering, Shanghai University of Engineering Science, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1100-0631","authenticated-orcid":false,"given":"Yaochu","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Engineering, Westlake University, Hangzhou, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"2944","article-title":"Learning continuous control policies by stochastic value gradients","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Heess","year":"2015"},{"key":"ref2","first-page":"264","article-title":"Lipschitz continuity in model-based reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Asadi","year":"2018"},{"key":"ref3","article-title":"Model-ensemble trust-region policy optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kurutach","year":"2018"},{"key":"ref4","first-page":"195","article-title":"Uncertainty-driven imagination for continuous deep reinforcement learning","volume-title":"Proc. Conf. Robot Learn.","author":"Kalweit","year":"2017"},{"key":"ref5","article-title":"Algorithmic framework for model-based deep reinforcement learning with theoretical guarantees","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Luo","year":"2018"},{"key":"ref6","first-page":"617","article-title":"Model-based reinforcement learning via meta-policy optimization","volume-title":"Proc. Conf. Robot Learn.","author":"Clavera","year":"2018"},{"key":"ref7","first-page":"4754","article-title":"Deep reinforcement learning in a handful of trials using probabilistic dynamics models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Chua","year":"2018"},{"key":"ref8","first-page":"12498","article-title":"When to trust your model: Model-based policy optimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Janner","year":"2019"},{"key":"ref9","article-title":"Reinforcement learning and control as probabilistic inference: Tutorial and review","author":"Levine","year":"2018"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6386109"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2016.2522401"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2018.2790388"},{"key":"ref13","article-title":"A latent batch-constrained deep reinforcement learning approach for precision dosing clinical decision support","volume":"237","author":"Qiu","year":"2021","journal-title":"Knowl.-Based Syst."},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-5007"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2891311"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989275"},{"key":"ref17","article-title":"A survey of progress on cooperative multi-agent reinforcement learning in open environment","author":"Yuan","year":"2023"},{"key":"ref18","article-title":"A survey on model-based reinforcement learning","volume":"67","author":"Luo","year":"2022","journal-title":"Sci. China Inf. Sci."},{"key":"ref19","article-title":"Prioritized experience replay","author":"Schaul","year":"2015"},{"key":"ref20","first-page":"1889","article-title":"Trust region policy optimization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Schulman","year":"2015"},{"key":"ref21","first-page":"1928","article-title":"Asynchronous methods for deep reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Mnih","year":"2016"},{"key":"ref22","article-title":"Playing atari with deep reinforcement learning","author":"Mnih","year":"2013"},{"key":"ref23","first-page":"29","article-title":"Deep recurrent Q-learning for partially observable MDPs","volume-title":"Proc. AAAI Fall Symp. Ser.","author":"Hausknecht","year":"2015"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10295"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3089425"},{"key":"ref26","first-page":"1587","article-title":"Addressing function approximation error in actor-critic methods","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Fujimoto","year":"2018"},{"key":"ref27","article-title":"Continuous control with deep reinforcement learning","author":"Lillicrap","year":"2015"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3044196"},{"key":"ref29","first-page":"1184","article-title":"Value propagation for decentralized networked deep multi-agent reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Qu","year":"2019"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1016\/j.automatica.2021.110076"},{"key":"ref31","article-title":"Model-free and Bayesian ensembling model-based deep reinforcement learning for particle accelerator control demonstrated on the FERMI FEL","author":"Hirlaender","year":"2020"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636468"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2015.2511658"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3107375"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6386025"},{"key":"ref36","first-page":"1","article-title":"Guided policy search","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Levine","year":"2013"},{"key":"ref37","first-page":"1071","article-title":"Learning neural network policies with guided policy search under unknown dynamics","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Levine","year":"2014"},{"key":"ref38","first-page":"49","article-title":"Guided cost learning: Deep inverse optimal control via policy optimization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Finn","year":"2016"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8463189"},{"key":"ref40","article-title":"Robust constrained model predictive control","author":"Richards","year":"2005"},{"key":"ref41","article-title":"Dream to control: Learning behaviors by latent imagination","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hafner","year":"2019"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/s13198-021-01552-7"},{"key":"ref43","first-page":"4015","article-title":"On the importance of hyperparameter optimization for model-based reinforcement learning","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"Zhang","year":"2021"},{"key":"ref44","first-page":"1433","article-title":"Maximum entropy inverse reinforcement learning","volume-title":"Proc. AAAI Conf. Artif. Intell.","author":"Ziebart","year":"2008"},{"key":"ref45","first-page":"202","article-title":"Taming the noise in reinforcement learning via soft updates","volume-title":"Proc. 32nd Conf. Uncertainty Artif. Intell.","author":"Fox","year":"2016"},{"key":"ref46","first-page":"1352","article-title":"Reinforcement learning with deep energy-based policies","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","author":"Haarnoja","year":"2017"},{"key":"ref47","first-page":"1861","article-title":"Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Haarnoja","year":"2018"},{"key":"ref48","article-title":"Model-based value expansion for efficient model-free reinforcement learning","volume-title":"Proc. 35th Int. Conf. Mach. Learn.","author":"Feinberg","year":"2018"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1038\/nature14236"},{"key":"ref50","first-page":"5767","article-title":"Improved training of wasserstein GANs","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gulrajani","year":"2017"},{"key":"ref51","article-title":"Exploring model-based planning with policy networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wang","year":"2019"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/springerreference_179268"},{"key":"ref53","article-title":"OpenAI gym","author":"Brockman","year":"2016"},{"key":"ref54","article-title":"Categorical reparameterization with gumbel-softmax","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Jang","year":"2016"},{"key":"ref55","article-title":"Soft actor-critic for discrete action settings","author":"Christodoulou","year":"2019"}],"container-title":["IEEE Transactions on Emerging Topics in Computational Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7433297\/10607834\/10463525.pdf?arnumber=10463525","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T20:10:56Z","timestamp":1736885456000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10463525\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8]]},"references-count":55,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tetci.2024.3369636","relation":{},"ISSN":["2471-285X"],"issn-type":[{"value":"2471-285X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8]]}}}