{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,15]],"date-time":"2026-01-15T06:22:33Z","timestamp":1768458153564,"version":"3.49.0"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:00:00Z","timestamp":1750291200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:00:00Z","timestamp":1750291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62103403"],"award-info":[{"award-number":["62103403"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U22A2056"],"award-info":[{"award-number":["U22A2056"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s10489-025-06693-x","type":"journal-article","created":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T09:01:40Z","timestamp":1750323700000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Reducing the value function over-estimation by Kullback-Leibler divergence regularized distributional actor-critic"],"prefix":"10.1007","volume":"55","author":[{"given":"Mingrong","family":"Gong","sequence":"first","affiliation":[]},{"given":"Zhengkun","family":"Yi","sequence":"additional","affiliation":[]},{"given":"Yidong","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Huiyun","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5539-4260","authenticated-orcid":false,"given":"Yunduan","family":"Cui","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,19]]},"reference":[{"issue":"7553","key":"6693_CR1","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun Y, Bengio Y, Hinton G (2015) Deep learning. Nature 521(7553):436\u2013444","journal-title":"Nature"},{"key":"6693_CR2","unstructured":"Sutton RS, Barto AG (2018) Reinforcement learning: An introduction"},{"issue":"6419","key":"6693_CR3","doi-asserted-by":"publisher","first-page":"1140","DOI":"10.1126\/science.aar6404","volume":"362","author":"D Silver","year":"2018","unstructured":"Silver D, Hubert T, Schrittwieser J, Antonoglou I, Lai M, Guez A, Lanctot M, Sifre L, Kumaran D, Graepel T et al (2018) A general reinforcement learning algorithm that masters chess, shogi, and go through self-play. Science 362(6419):1140\u20131144","journal-title":"Science"},{"issue":"8","key":"6693_CR4","doi-asserted-by":"publisher","first-page":"2488","DOI":"10.1007\/s10489-020-01637-z","volume":"50","author":"E Lin","year":"2020","unstructured":"Lin E, Chen Q, Qi X (2020) Deep reinforcement learning for imbalanced classification. Appl Intell 50(8):2488\u20132502","journal-title":"Appl Intell"},{"key":"6693_CR5","unstructured":"Lillicrap TP, Hunt JJ, Pritzel A, Heess N, Erez T, Tassa Y, Silver D, Wierstra D (2016) Continuous Control with Deep Reinforcement Learning. In: International Conference on Learning Representations (ICLR)"},{"issue":"12","key":"6693_CR6","doi-asserted-by":"publisher","first-page":"15188","DOI":"10.1007\/s10489-022-04217-5","volume":"53","author":"Z Song","year":"2023","unstructured":"Song Z, Wang Y, Qian P, Song S, Coenen F, Jiang Z, Su J (2023) From deterministic to stochastic: an interpretable stochastic model-free reinforcement learning framework for portfolio optimization. Appl Intell 53(12):15188\u201315203","journal-title":"Appl Intell"},{"key":"6693_CR7","unstructured":"Fujimoto S, Hoof H, Meger D (2018) Addressing Function Approximation Error in Actor-critic Methods. In: International Conference on Machine Learning (ICML), pp. 1587\u20131596. PMLR"},{"key":"6693_CR8","doi-asserted-by":"crossref","unstructured":"Meng L, Gorbet R, Kuli\u0107 D (2021) The effect of multi-step methods on overestimation in deep reinforcement learning. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 347\u2013353. IEEE","DOI":"10.1109\/ICPR48806.2021.9413027"},{"issue":"11","key":"6693_CR9","doi-asserted-by":"publisher","first-page":"6584","DOI":"10.1109\/TNNLS.2021.3082568","volume":"33","author":"J Duan","year":"2021","unstructured":"Duan J, Guan Y, Li SE, Ren Y, Sun Q, Cheng B (2021) Distributional soft actor-critic: Off-policy reinforcement learning for addressing value estimation errors. IEEE transactions on neural networks and learning systems. 33(11):6584\u20136598","journal-title":"IEEE transactions on neural networks and learning systems."},{"key":"6693_CR10","doi-asserted-by":"publisher","first-page":"5823","DOI":"10.1609\/aaai.v37i5.25722","volume":"37","author":"L Meng","year":"2023","unstructured":"Meng L, Ge Z, Tian P, An B, Gao Y (2023) An efficient deep reinforcement learning algorithm for solving imperfect information extensive-form games. Proceedings of the AAAI Conference on Artificial Intelligence 37:5823\u20135831","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"issue":"1","key":"6693_CR11","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1023\/A:1022633531479","volume":"3","author":"RS Sutton","year":"1988","unstructured":"Sutton RS (1988) Learning to predict by the methods of temporal differences. Mach Learn 3(1):9\u201344","journal-title":"Mach Learn"},{"issue":"7","key":"6693_CR12","doi-asserted-by":"publisher","first-page":"7195","DOI":"10.1007\/s10462-022-10348-5","volume":"56","author":"Y Lei","year":"2023","unstructured":"Lei Y, Ye D, Shen S, Sui Y, Zhu T, Zhou W (2023) New challenges in reinforcement learning: a survey of security and privacy. Artif Intell Rev 56(7):7195\u20137236","journal-title":"Artif Intell Rev"},{"key":"6693_CR13","unstructured":"Chen M, He G (2023) Efficient and Stable Off-policy Training Via Behavior-aware Evolutionary Learning. In: Conference on Robot Learning, pp. 482\u2013491. PMLR"},{"key":"6693_CR14","unstructured":"Haarnoja T, Zhou A, Abbeel P, Levine S (2018) Soft Actor-critic: Off-policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor. In: International Conference on Machine Learning, pp. 1861\u20131870. PMLR"},{"issue":"11","key":"6693_CR15","doi-asserted-by":"publisher","first-page":"4933","DOI":"10.1109\/TNNLS.2019.2959129","volume":"31","author":"D Wu","year":"2020","unstructured":"Wu D, Dong X, Shen J, Hoi SCH (2020) Reducing estimation bias via triplet-average deep deterministic policy gradient. IEEE Transactions on Neural Networks and Learning Systems. 31(11):4933\u20134945","journal-title":"IEEE Transactions on Neural Networks and Learning Systems."},{"issue":"13","key":"6693_CR16","doi-asserted-by":"publisher","first-page":"16893","DOI":"10.1007\/s10489-022-04354-x","volume":"53","author":"G Wu","year":"2023","unstructured":"Wu G, Fang W, Wang J, Ge P, Cao J, Ping Y, Gou P (2023) Dyna-ppo reinforcement learning with gaussian process for the continuous action decision-making in autonomous driving. Appl Intell 53(13):16893\u201316907","journal-title":"Appl Intell"},{"issue":"7","key":"6693_CR17","doi-asserted-by":"publisher","first-page":"4600","DOI":"10.1109\/TSMC.2021.3098451","volume":"52","author":"Y Gu","year":"2022","unstructured":"Gu Y, Cheng Y, Chen CLP, Wang X (2022) Proximal policy optimization with policy feedback. IEEE Transactions on Systems, Man, and Cybernetics: Systems. 52(7):4600\u20134610","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics: Systems."},{"key":"6693_CR18","doi-asserted-by":"publisher","first-page":"21770","DOI":"10.1609\/aaai.v38i19.30177","volume":"38","author":"H Zhang","year":"2024","unstructured":"Zhang H, Lin Y, Shen S, Han S, Lv K (2024) Enhancing off-policy constrained reinforcement learning through adaptive ensemble c estimation. Proceedings of the AAAI Conference on Artificial Intelligence 38:21770\u201321778","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"issue":"10","key":"6693_CR19","doi-asserted-by":"publisher","first-page":"1943","DOI":"10.1177\/01423312231225782","volume":"46","author":"Y Tao","year":"2024","unstructured":"Tao Y, Tao H, Zhuang Z, Stojanovic V, Paszke W (2024) Quantized iterative learning control of communication-constrained systems with encoding and decoding mechanism. Trans Inst Meas Control 46(10):1943\u20131954","journal-title":"Trans Inst Meas Control"},{"key":"6693_CR20","doi-asserted-by":"publisher","DOI":"10.1016\/j.aei.2024.102986","volume":"64","author":"Y Sun","year":"2025","unstructured":"Sun Y, Tao H, Stojanovic V (2025) Pseudo-label guided dual classifier domain adversarial network for unsupervised cross-domain fault diagnosis with small samples. Adv Eng Inform 64:102986","journal-title":"Adv Eng Inform"},{"issue":"8","key":"6693_CR21","doi-asserted-by":"publisher","first-page":"3857","DOI":"10.1109\/TCSI.2024.3371492","volume":"71","author":"Z Du","year":"2024","unstructured":"Du Z, Xie X, Qu Z, Hu Y, Stojanovic V (2024) Dynamic event-triggered consensus control for interval type-2 fuzzy multi-agent systems. IEEE Trans Circuits Syst I Regul Pap 71(8):3857\u20133866","journal-title":"IEEE Trans Circuits Syst I Regul Pap"},{"key":"6693_CR22","unstructured":"Bellemare MG, Dabney W, Munos R (2017) A Distributional Perspective on Reinforcement Learning. In: International Conference on Machine Learning, pp. 449\u2013458. PMLR"},{"key":"6693_CR23","doi-asserted-by":"crossref","unstructured":"Dabney W, Rowland M, Bellemare M, Munos R (2018) Distributional Reinforcement Learning with Quantile Regression. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32","DOI":"10.1609\/aaai.v32i1.11791"},{"key":"6693_CR24","unstructured":"Kuznetsov A, Shvechikov P, Grishin A, Vetrov D (2020) Controlling Overestimation Bias with Truncated Mixture of Continuous Distributional Quantile Critics. In: International Conference on Machine Learning, pp. 5556\u20135566. PMLR"},{"issue":"1","key":"6693_CR25","first-page":"3207","volume":"13","author":"MG Azar","year":"2012","unstructured":"Azar MG, G\u00f3mez V, Kappen HJ (2012) Dynamic policy programming. The Journal of Machine Learning Research (JMLR). 13(1):3207\u20133245","journal-title":"The Journal of Machine Learning Research (JMLR)."},{"key":"6693_CR26","unstructured":"Kozuno T, Uchibe E, Doya K (2019) Theoretical Analysis of Efficiency and Robustness of Softmax and Gap-increasing Operators in Reinforcement Learning. In: The 22nd International Conference on Artificial Intelligence and Statistics, pp. 2995\u20133003. PMLR"},{"key":"6693_CR27","first-page":"12163","volume":"33","author":"N Vieillard","year":"2020","unstructured":"Vieillard N, Kozuno T, Scherrer B, Pietquin O, Munos R, Geist M (2020) Leverage the Average: an Analysis of Kl Regularization in Reinforcement Learning. Advances in Neural Information Processing Systems (NeurIPS) 33:12163\u201312174","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"6693_CR28","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1016\/j.neunet.2017.06.007","volume":"94","author":"Y Cui","year":"2017","unstructured":"Cui Y, Matsubara T, Sugimoto K (2017) Kernel dynamic policy programming: Applicable reinforcement learning to robot systems with high dimensional states. Neural Netw 94:13\u201323","journal-title":"Neural Netw"},{"issue":"21","key":"6693_CR29","doi-asserted-by":"publisher","first-page":"24847","DOI":"10.1007\/s10489-023-04867-z","volume":"53","author":"R Li","year":"2023","unstructured":"Li R, Shang Z, Zheng C, Li H, Liang Q, Cui Y (2023) Efficient distributional reinforcement learning with kullback-leibler divergence regularization. Appl Intell 53(21):24847\u201324863","journal-title":"Appl Intell"},{"key":"6693_CR30","doi-asserted-by":"publisher","first-page":"72","DOI":"10.1016\/j.robot.2018.11.004","volume":"112","author":"Y Tsurumine","year":"2019","unstructured":"Tsurumine Y, Cui Y, Uchibe E, Matsubara T (2019) Deep reinforcement learning with smooth policy update: Application to robotic cloth manipulation. Robot Auton Syst 112:72\u201383","journal-title":"Robot Auton Syst"},{"key":"6693_CR31","doi-asserted-by":"publisher","DOI":"10.1016\/j.conengprac.2020.104331","volume":"97","author":"L Zhu","year":"2020","unstructured":"Zhu L, Cui Y, Takami G, Kanokogi H, Matsubara T (2020) Scalable reinforcement learning for plant-wide control of vinyl acetate monomer process. Control Eng Pract 97:104331","journal-title":"Control Eng Pract"},{"issue":"11","key":"6693_CR32","doi-asserted-by":"publisher","first-page":"11011","DOI":"10.1088\/1742-5468\/2005\/11\/P11011","volume":"2005","author":"HJ Kappen","year":"2005","unstructured":"Kappen HJ (2005) Path integrals and symmetry breaking for optimal control theory. J Stat Mech: Theory Exp 2005(11):11011","journal-title":"J Stat Mech: Theory Exp"},{"key":"6693_CR33","doi-asserted-by":"crossref","unstructured":"Todorov E (2006) Linearly-solvable Markov Decision Problems. In: Advances in Neural Information Processing Systems (NeurIPS), vol. 19","DOI":"10.7551\/mitpress\/7503.003.0176"},{"key":"6693_CR34","unstructured":"Rowland M, Dadashi R, Kumar S, Munos R, Bellemare MG, Dabney W (2019) Statistics and Samples in Distributional Reinforcement Learning. In: International Conference on Machine Learning, pp. 5528\u20135536. PMLR"},{"key":"6693_CR35","doi-asserted-by":"publisher","first-page":"3704","DOI":"10.1109\/TASE.2024.3398712","volume":"22","author":"C Miao","year":"2025","unstructured":"Miao C, Cui Y, Li H, Wu X (2025) Effective multi-agent deep reinforcement learning control with relative entropy regularization. IEEE Trans Autom Sci Eng 22:3704\u20133718","journal-title":"IEEE Trans Autom Sci Eng"},{"key":"6693_CR36","doi-asserted-by":"crossref","unstructured":"Huber PJ (1992) Robust estimation of a location parameter. Breakthroughs in statistics: Methodology and distribution, 492\u2013518","DOI":"10.1007\/978-1-4612-4380-9_35"},{"key":"6693_CR37","unstructured":"Brockman G, Cheung V, Pettersson L, Schneider J, Schulman J, Tang J, Zaremba W (2016) Openai gym. arXiv:1606.01540"},{"key":"6693_CR38","unstructured":"Towers M, Kwiatkowski A, Terry J, Balis JU, De\u00a0Cola G, Deleu T, Goulao M, Kallinteris A, Krimmel M, KG A et al (2024) Gymnasium: A standard interface for reinforcement learning environments. arXiv:2407.17032"},{"key":"6693_CR39","doi-asserted-by":"crossref","unstructured":"Todorov E, Erez T, Tassa Y (2012) MuJoCo: A Physics Engine for Model-based Control. In: IEEE\/RSJ International Conference on Intelligent Robots and Systems, pp. 5026\u20135033","DOI":"10.1109\/IROS.2012.6386109"},{"key":"6693_CR40","unstructured":"Paszke A (2019) PyTorch: An Imperative Style, High-Performance Deep Learning Library. Advances in Neural Information Processing Systems (Neurips) 32:8024\u20138035"},{"key":"6693_CR41","unstructured":"Singh R, Lee K, Chen Y (2022) Sample-based Distributional Policy Gradient. In: Learning for Dynamics and Control Conference, pp. 676\u2013688. PMLR"},{"key":"6693_CR42","unstructured":"Schulman J, Wolski F, Dhariwal P, Radford A, Klimov O (2017) Proximal policy optimization algorithms. arXiv:1707.06347"},{"key":"6693_CR43","first-page":"61573","volume":"36","author":"S Fujimoto","year":"2023","unstructured":"Fujimoto S, Chang W-D, Smith EJ, Gu SS, Precup D, Meger D (2023) For SALE: State-Action Representation Learning for Deep Reinforcement Learning. Advances in Neural Information Processing Systems (Neurips) 36:61573\u201361624","journal-title":"Advances in Neural Information Processing Systems (Neurips)"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-025-06693-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-025-06693-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-025-06693-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T13:39:21Z","timestamp":1758289161000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-025-06693-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,19]]},"references-count":43,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["6693"],"URL":"https:\/\/doi.org\/10.1007\/s10489-025-06693-x","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6,19]]},"assertion":[{"value":"31 May 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 June 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"793"}}