{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T21:01:41Z","timestamp":1769720501322,"version":"3.49.0"},"reference-count":55,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"7","license":[{"start":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T00:00:00Z","timestamp":1719792000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T00:00:00Z","timestamp":1719792000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T00:00:00Z","timestamp":1719792000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Science Foundation of China","doi-asserted-by":"publisher","award":["61976043"],"award-info":[{"award-number":["61976043"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Science Foundation of China","doi-asserted-by":"publisher","award":["61836011"],"award-info":[{"award-number":["61836011"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2024,7]]},"DOI":"10.1109\/tnnls.2022.3215596","type":"journal-article","created":{"date-parts":[[2022,11,4]],"date-time":"2022-11-04T00:52:26Z","timestamp":1667523146000},"page":"8783-8796","source":"Crossref","is-referenced-by-count":14,"title":["Improving Exploration in Actor\u2013Critic With Weakly Pessimistic Value Estimation and Optimistic Policy Optimization"],"prefix":"10.1109","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8949-4905","authenticated-orcid":false,"given":"Fan","family":"Li","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9257-126X","authenticated-orcid":false,"given":"Mingsheng","family":"Fu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Wenyu","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5894-3237","authenticated-orcid":false,"given":"Fan","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9821-508X","authenticated-orcid":false,"given":"Haixian","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Sichuan University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6114-3441","authenticated-orcid":false,"given":"Hong","family":"Qu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5867-9322","authenticated-orcid":false,"given":"Zhang","family":"Yi","sequence":"additional","affiliation":[{"name":"School of Computer Science, Sichuan University, Chengdu, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"10607","article-title":"Prediction-guided multi-objective reinforcement learning for continuous robot control","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Xu"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3059912"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.13140\/RG.2.2.18893.74727"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2021.3117790"},{"key":"ref5","first-page":"4018","article-title":"Diversity actor-critic: Sample-aware entropy regularization for sample-efficient exploration","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Han"},{"key":"ref6","first-page":"1352","article-title":"Reinforcement learning with deep energy-based policies","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Haarnoja"},{"key":"ref7","first-page":"1861","article-title":"Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Haarnoja"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/475"},{"key":"ref9","article-title":"Surprise-based intrinsic motivation for deep reinforcement learning","author":"Achiam","year":"2017","journal-title":"arXiv:1703.01732"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.70"},{"key":"ref11","first-page":"1587","article-title":"Addressing function approximation error in actor-critic methods","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Fujimoto"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1812.05905"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3082568"},{"key":"ref14","first-page":"1787","article-title":"Better exploration with optimistic actor critic","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Ciosek"},{"key":"ref15","article-title":"Off-policy reinforcement learning with optimistic exploration and distribution correction","author":"Li","year":"2021","journal-title":"arXiv:2110.12081"},{"key":"ref16","article-title":"Reducing conservativeness oriented offline reinforcement learning","author":"Zhang","year":"2021","journal-title":"arXiv:2103.00098"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-141-3.50030-4"},{"key":"ref18","first-page":"213","article-title":"R-max-a general polynomial time algorithm for near-optimal reinforcement learning","volume":"3","author":"Brafman","year":"2002","journal-title":"J. Mach. Learn. Res."},{"key":"ref19","first-page":"1","article-title":"Is Q-learning provably efficient?","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Jin"},{"key":"ref20","first-page":"577","article-title":"Principled exploration via optimistic bootstrapping and backward induction","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Bai"},{"key":"ref21","article-title":"Towards tractable optimism in model-based reinforcement learning","author":"Pacchiano","year":"2020","journal-title":"arXiv:2006.11911"},{"key":"ref22","first-page":"1","article-title":"Maxmin Q-learning: Controlling the estimation bias of q-learning","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Lan"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2959129"},{"key":"ref24","article-title":"Openai gym","author":"Brockman","year":"2016","journal-title":"arXiv:1606.01540"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6386109"},{"key":"ref26","first-page":"1008","article-title":"Actor-critic algorithms","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Konda"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/BF00115009"},{"issue":"2","key":"ref28","first-page":"229","article-title":"Reinforcement learning: An introduction","volume":"17","author":"Sutton","year":"1999","journal-title":"Robotica"},{"key":"ref29","first-page":"1","article-title":"Continuous control with deep reinforcement learning","volume-title":"Proc. 4th Int. Conf. Learn. Represent.","author":"Lillicrap"},{"key":"ref30","first-page":"387","article-title":"Deterministic policy gradient algorithms","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Silver"},{"key":"ref31","first-page":"1329","article-title":"Benchmarking deep reinforcement learning for continuous control","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Duan"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11694"},{"key":"ref33","first-page":"12519","article-title":"When to trust your model: Model-based policy optimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Janner"},{"key":"ref34","article-title":"Model-augmented actor-critic: Backpropagating through paths","author":"Clavera","year":"2020","journal-title":"arXiv:2005.08068"},{"key":"ref35","volume-title":"Modeling Purposeful Adaptive Behavior With the Principle of Maximum Causal Entropy","author":"Ziebart","year":"2010"},{"key":"ref36","article-title":"Distributed distributional deterministic policy gradients","author":"Barth-Maron","year":"2018","journal-title":"arXiv:1804.08617"},{"key":"ref37","first-page":"11767","article-title":"Softmax deep double deterministic policy gradients","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Pan"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/276"},{"key":"ref39","first-page":"5940","article-title":"A regularized approach to sparse optimal policy in reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Yang"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/tcyb.2021.3104612"},{"key":"ref41","first-page":"11890","article-title":"Predictive information accelerates learning in RL","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Lee"},{"key":"ref42","first-page":"741","article-title":"Stochastic latent actor-critic: Deep reinforcement learning with a latent variable model","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Lee"},{"key":"ref43","first-page":"255","article-title":"Issues in using function approximation for reinforcement learning","volume-title":"Proc. 4th Connectionist Models Summer School","author":"Thrun"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10295"},{"key":"ref45","first-page":"8454","article-title":"Ensemble bootstrapping for Q-learning","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Peer"},{"key":"ref46","first-page":"104","article-title":"An optimistic perspective on offline reinforcement learning","volume-title":"Proc. 37th Int. Conf. Mach. Learn.","author":"Agarwal"},{"key":"ref47","first-page":"263","article-title":"Minimax regret bounds for reinforcement learning","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","author":"Azar"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1287\/moor.2022.1309"},{"key":"ref49","first-page":"10746","article-title":"Reinforcement learning in feature space: Matrix bandit, kernels, and regret bound","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Yang"},{"key":"ref50","first-page":"1","article-title":"A unifying view of optimism in episodic reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Neu"},{"key":"ref51","first-page":"4026","article-title":"Deep exploration via bootstrapped DQN","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"29","author":"Osband"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2018.2795041"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.2307\/2291177"},{"key":"ref54","first-page":"1179","article-title":"Conservative Q-learning for offline reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Kumar"},{"key":"ref55","first-page":"1","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. 3rd Int. Conf. Learn. Represent.","author":"Kingma"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5962385\/10589508\/09932556.pdf?arnumber=9932556","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,5]],"date-time":"2024-09-05T18:28:01Z","timestamp":1725560881000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9932556\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7]]},"references-count":55,"journal-issue":{"issue":"7"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2022.3215596","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"value":"2162-237X","type":"print"},{"value":"2162-2388","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,7]]}}}