{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T17:31:09Z","timestamp":1758303069465,"version":"3.44.0"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2025,4,29]],"date-time":"2025-04-29T00:00:00Z","timestamp":1745884800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,29]],"date-time":"2025-04-29T00:00:00Z","timestamp":1745884800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376041","62102347"],"award-info":[{"award-number":["62376041","62102347"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2021M69236"],"award-info":[{"award-number":["2021M69236"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key Laboratory of Symbolic Computation and Knowledge Engineering of Ministry of Education, Jilin University","award":["93K172021K01"],"award-info":[{"award-number":["93K172021K01"]}]},{"name":"State Key Lab. for Novel Software Technology, Nanjing University","award":["KFKT2024B51"],"award-info":[{"award-number":["KFKT2024B51"]}]},{"DOI":"10.13039\/501100007129","name":"Shandong Provincial Natural Science Foundation","doi-asserted-by":"crossref","award":["ZR2023MF0766"],"award-info":[{"award-number":["ZR2023MF0766"]}],"id":[{"id":"10.13039\/501100007129","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s10489-024-06083-9","type":"journal-article","created":{"date-parts":[[2025,4,28]],"date-time":"2025-04-28T22:49:05Z","timestamp":1745880545000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing model learning in reinforcement learning through Q-function-guided trajectory alignment"],"prefix":"10.1007","volume":"55","author":[{"given":"Xin","family":"Du","sequence":"first","affiliation":[]},{"given":"Shan","family":"Zhong","sequence":"additional","affiliation":[]},{"given":"Shengrong","family":"Gong","sequence":"additional","affiliation":[]},{"given":"Yali","family":"Si","sequence":"additional","affiliation":[]},{"given":"Zhenyu","family":"Qi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,29]]},"reference":[{"key":"6083_CR1","unstructured":"Sutton RS (2018) Reinforcement learning: An introduction. A Bradford Book"},{"key":"6083_CR2","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1038\/nature14236","volume":"518","author":"V Mnih","year":"2015","unstructured":"Mnih V, Kavukcuoglu K, Silver D, Rusu AA, Veness J, Bellemare MG, Graves A, Riedmiller M, Fidjeland AK, Ostrovski G et al (2015) Human-level control through deep reinforcement learning. 
Nature 518:529\u2013533","journal-title":"Nature"},{"key":"6083_CR3","doi-asserted-by":"publisher","first-page":"411","DOI":"10.1146\/annurev-control-042920-020211","volume":"5","author":"L Brunke","year":"2022","unstructured":"Brunke L, Greeff M, Hall AW, Yuan Z, Zhou S, Panerati J, Schoellig AP (2022) Safe learning in robotics: From learning-based control to safe reinforcement learning. Ann Rev Control Robot Auton Syst 5:411\u2013444","journal-title":"Ann Rev Control Robot Auton Syst"},{"key":"6083_CR4","doi-asserted-by":"crossref","unstructured":"Fu Q, Han Z, Chen J, Lu Y, Wu H, Wang Y (2022) Applications of reinforcement learning for building energy efficiency control: A review. J Build Eng 50","DOI":"10.1016\/j.jobe.2022.104165"},{"key":"6083_CR5","first-page":"3689","volume":"69","author":"Z Peng","year":"2022","unstructured":"Peng Z, Luo R, Hu J, Shi K, Ghosh BK (2022) Distributed optimal tracking control of discrete-time multiagent systems via event-triggered reinforcement learning. IEEE Trans Circ Syst 69:3689\u20133700","journal-title":"IEEE Trans Circ Syst"},{"key":"6083_CR6","unstructured":"Wu J, Ma H, Deng C, Long M (2024) Pre-training contextualized world models with in-the-wild videos for reinforcement learning. In: In Proceedings of the international conference on advances in neural information processing systems(NeurIPS)"},{"key":"6083_CR7","doi-asserted-by":"publisher","first-page":"110335","DOI":"10.1016\/j.knosys.2023.110335","volume":"264","author":"X Chen","year":"2023","unstructured":"Chen X, Yao L, McAuley J, Zhou G, Wang X (2023) Deep reinforcement learning in recommender systems: A survey and new perspectives. Knowl Based Syst 264:110335","journal-title":"Knowl Based Syst"},{"key":"6083_CR8","unstructured":"Fujimoto S, Chang W-D, Smith E, Gu SS, Precup D, Meger D (2024) For sale: State-action representation learning for deep reinforcement learning. Adv Neural Inf Process Syst 36"},{"key":"6083_CR9","unstructured":"Bhatt A, Palenicek D, Belousov B, Argus M, Amiranashvili A, Brox T, Peters J (2019) Crossq: Batch normalization in deep reinforcement learning for greater sample efficiency and simplicity. arXiv:1902.05605"},{"key":"6083_CR10","doi-asserted-by":"crossref","unstructured":"Shang Z, Li R, Zheng C, Li H, Cui Y (2023) Relative entropy regularized sample-efficient reinforcement learning with continuous actions. IEEE Trans Neural Netw Learn Syst","DOI":"10.36227\/techrxiv.20141084"},{"key":"6083_CR11","unstructured":"Hiraoka T, Imagawa T, Hashimoto T, Onishi T, Tsuruoka Y (2021) Dropout q-functions for doubly efficient reinforcement learning. In: International conference on learning representations"},{"key":"6083_CR12","unstructured":"Wang T, Bao X, Clavera I, Hoang J, Wen Y, Langlois E, Zhang S, Zhang G, Abbeel P, Ba J (2019) Benchmarking model-based reinforcement learning. arXiv:1907.02057"},{"key":"6083_CR13","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1561\/2200000086","volume":"16","author":"TM Moerland","year":"2023","unstructured":"Moerland TM, Broekens J, Plaat A, Jonker CM et al (2023) Model-based reinforcement learning: A survey. Found Trends Mach Learn 16:1\u2013118","journal-title":"Found Trends Mach Learn"},{"key":"6083_CR14","doi-asserted-by":"publisher","first-page":"121101","DOI":"10.1007\/s11432-022-3696-5","volume":"67","author":"F-M Luo","year":"2024","unstructured":"Luo F-M, Xu T, Lai H, Chen X-H, Zhang W, Yu Y (2024) A survey on model-based reinforcement learning. 
Sci Chin Inf Sci 67:121101","journal-title":"Sci Chin Inf Sci"},{"key":"6083_CR15","doi-asserted-by":"crossref","unstructured":"Nagabandi A, Kahn G, Fearing RS, Levine S (2018) Neural network dynamics for model-based deep reinforcement learning with model-free fine-tuning. In: Proceedings of the international conference on robotics and automation (ICRA)","DOI":"10.1109\/ICRA.2018.8463189"},{"key":"6083_CR16","unstructured":"Luo Y, Xu H, Li Y, Tian Y, Darrell T, Ma T (2019) Algorithmic framework for model-based deep reinforcement learning with theoretical guarantees. In: Proceedings of the international conference on learning representations (ICLR)"},{"key":"6083_CR17","unstructured":"Janner M, Fu J, Zhang M, Levine S (2019) When to trust your model: Model-based policy optimization. In: Proceedings of the international conference on advances in neural information processing systems(NeurIPS)"},{"key":"6083_CR18","unstructured":"Lambert N, Amos B, Yadan O, Calandra R (2020) Objective mismatch in model-based reinforcement learning. In: Proceedings of the conference on learning for dynamics and control (L4DC)"},{"key":"6083_CR19","doi-asserted-by":"crossref","unstructured":"Venkatraman A, Capobianco R, Pinto L, Hebert M, Nardi D, Bagnell JA (2017) Improved learning of dynamics models for control. In: In Proceedings of the conference on international symposium on experimental robotics (ISER)","DOI":"10.1007\/978-3-319-50115-4_61"},{"key":"6083_CR20","unstructured":"Asadi K, Misra D, Kim S, Littman ML (2019) Combating the compounding-error problem with a multi-step model. arXiv:1905.13320"},{"key":"6083_CR21","unstructured":"Schulman J, Levine S, Abbeel P, Jordan M, Moritz P (2015) Trust region policy optimization. In: In Proceedings of the international conference on machine learning (ICML)"},{"key":"6083_CR22","unstructured":"Wu Y-H, Fan T-H, Ramadge PJ, Su H (2019) Model imitation for model-based reinforcement learning. arXiv:1909.11821"},{"key":"6083_CR23","unstructured":"Wu C, Li T, Zhang Z, Yu Y (2022) Bayesian optimistic optimization: Optimistic exploration for model-based reinforcement learning. In: Proceedings of the international conference on advances in neural information processing systems (NeurIPS)"},{"key":"6083_CR24","unstructured":"Kurutach T, Clavera I, Duan Y, Tamar A, Abbeel P (2018) Model-ensemble trust-region policy optimization. arXiv:1802.10592"},{"key":"6083_CR25","unstructured":"Chua K, Calandra R, McAllister R, Levine S (2018) Deep reinforcement learning in a handful of trials using probabilistic dynamics models. In: Proceedings of the international conference on advances in neural information processing systems (NeurIPS)"},{"key":"6083_CR26","unstructured":"Ji T, Luo Y, Sun F, Jing M, He F, Huang W (2022) When to update your model: Constrained model-based reinforcement learning. In: Proceedings of the international conference on advances in neural information processing systems (NeurIPS)"},{"key":"6083_CR27","unstructured":"Buckman J, Hafner D, Tucker G, Brevdo E, Lee H (2018) Sample-efficient reinforcement learning with stochastic ensemble value expansion. In: Proceedings of the international conference on advances in neural information processing systems (NeurIPS)"},{"key":"6083_CR28","doi-asserted-by":"crossref","unstructured":"De Boer P-T, Kroese DP, Mannor S, Rubinstein RY (2005) A tutorial on the cross-entropy method. 
Ann Oper Res 134","DOI":"10.1007\/s10479-005-5724-z"},{"key":"6083_CR29","unstructured":"Wang T, Ba J (2019) Exploring model-based planning with policy networks. arXiv:1906.08649"},{"key":"6083_CR30","doi-asserted-by":"crossref","unstructured":"Nguyen DH, Widrow B (1990) Neural networks for self-learning control systems. IEEE Control Syst Mag 10","DOI":"10.1109\/37.55119"},{"key":"6083_CR31","unstructured":"Heess N, Wayne G, Silver D, Lillicrap T, Erez T, Tassa Y (2015) Learning continuous control policies by stochastic value gradients. In: Proceedings of the international conference on advances in neural information processing systems (NeurIPS)"},{"key":"6083_CR32","unstructured":"Clavera I, Fu V, Abbeel P (2020) Model-augmented actor-critic: Backpropagating through paths. arXiv:2005.08068"},{"key":"6083_CR33","unstructured":"Amos B, Stanton S, Yarats D, Wilson AG (2021) On the model-based stochastic value gradient for continuous reinforcement learning. In: Proceedings of the conference on learning for dynamics and control (L4DC)"},{"key":"6083_CR34","unstructured":"Feinberg V, Wan A, Stoica I, Jordan MI, Gonzalez JE, Levine S (2018) Model-based value estimation for efficient model-free reinforcement learning. arXiv:1803.00101"},{"key":"6083_CR35","doi-asserted-by":"crossref","unstructured":"Zhou Q, Li H, Wang J (2020) Deep model-based reinforcement learning via estimated uncertainty and conservative policy optimization. In: Proceedings of the AAAI conference on artificial intelligence (AAAI)","DOI":"10.1609\/aaai.v34i04.6177"},{"key":"6083_CR36","unstructured":"Wu Z, Yu C, Chen C, Hao J, Zhuo HH (2022) Plan to predict: Learning an uncertainty-foreseeing model for model-based reinforcement learning. In: Proceedings of the international conference on advances in neural information processing systems (NeurIPS)"},{"key":"6083_CR37","unstructured":"Shen J, Zhao H, Zhang W, Yu Y (2020) Model-based policy optimization with unsupervised model adaptation. In: Proceedings of the international conference on advances in neural information processing systems (NeurIPS)"},{"key":"6083_CR38","doi-asserted-by":"crossref","unstructured":"Wang Z, Wang J, Zhou Q, Li B, Li H (2022) Sample-efficient reinforcement learning via conservative model-based actor-critic. In: Proceedings of AAAI conference on artificial intelligence (AAAI)","DOI":"10.1609\/aaai.v36i8.20839"},{"key":"6083_CR39","unstructured":"Haarnoja T, Zhou A, Abbeel P, Levine S (2018) Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor. In: Proceedings of the international conference on machine learning (ICML)"},{"key":"6083_CR40","doi-asserted-by":"crossref","unstructured":"Dong K, Luo Y, Wang Y, Liu Y, Qu C, Zhang Q, Cheng E, Sun Z, Song B (2024) Dyna-style model-based reinforcement learning with model-free policy optimization. Knowledge-Based Systems, 111428","DOI":"10.1016\/j.knosys.2024.111428"},{"key":"6083_CR41","unstructured":"Zhang, H., Yu, H., Zhao, J., Zhang, D., Zhou, H., Huang, C., Ye, C., et al.: How to fine-tune the model: Unified model shift and model bias policy optimization. In: In Proceedings of the International Conference on Advances in Neural Information Processing Systems(NeurIPS) (2024)"},{"key":"6083_CR42","unstructured":"Vemula A, Song Y, Singh A, Bagnell D, Choudhury S (2023) The virtues of laziness in model-based rl: A unified objective and algorithms. 
In: Proceedings of the international conference on machine learning (ICML)"},{"key":"6083_CR43","unstructured":"Rigter M, Lacerda B, Hawes N (2024) One risk to rule them all: A risk-sensitive perspective on model-based offline reinforcement learning. In: Proceedings of the international conference on advances in neural information processing systems (NeurIPS)"},{"key":"6083_CR44","unstructured":"Yu T, Thomas G, Yu L, Ermon S, Zou JY, Levine S, Finn C, Ma T (2020) Mopo: Model-based offline policy optimization. In: Proceedings of the international conference on advances in neural information processing systems (NeurIPS)"},{"key":"6083_CR45","unstructured":"Pan F, He J, Tu D, He Q (2020) Trust the model when it is confident: Masked model-based actor-critic. In: Proceedings of the international conference on advances in neural information processing systems(NeurIPS)"},{"key":"6083_CR46","unstructured":"Hansen NA, Su H, Wang X (2022) Temporal difference learning for model predictive control. In: International conference on machine learning, pp 8387\u20138406. PMLR"},{"key":"6083_CR47","unstructured":"Nair S, Savarese S, Finn C (2020) Goal-aware prediction: Learning to model what matters. In: In Proceedings of the international conference on machine learning (ICML)"},{"key":"6083_CR48","unstructured":"Chen X-H, Yu Y, Zhu Z-M, Yu Z, Chen Z, Wang C, Wu Y, Wu H, Qin R-J, Ding R, et al (2022) Adversarial counterfactual environment model learning. arXiv:2206.04890"},{"key":"6083_CR49","doi-asserted-by":"crossref","unstructured":"Jia C, Zhang F, Xu T, Pang J-C, Zhang Z, Yu Y (2024) Model gradient: unified model and policy learning in model-based reinforcement learning. Front Comput Sci 18:184339","DOI":"10.1007\/s11704-023-3150-5"},{"key":"6083_CR50","unstructured":"Deisenroth M, Rasmussen CE (2011) Pilco: A model-based and data-efficient approach to policy search. In: In Proceedings of the international conference on machine learning (ICML)"},{"key":"6083_CR51","doi-asserted-by":"publisher","first-page":"6950","DOI":"10.1109\/TII.2022.3142323","volume":"18","author":"Y Cui","year":"2022","unstructured":"Cui Y, Peng L, Li H (2022) Filtered probabilistic model predictive control-based reinforcement learning for unmanned surface vehicles. IEEE Trans Ind Inf 18:6950\u20136961","journal-title":"IEEE Trans Ind Inf"},{"key":"6083_CR52","doi-asserted-by":"crossref","unstructured":"Talvitie E (2017) Self-correcting models for model-based reinforcement learning. In: In Proceedings of the AAAI conference on artificial intelligence (AAAI)","DOI":"10.1609\/aaai.v31i1.10850"},{"key":"6083_CR53","doi-asserted-by":"crossref","unstructured":"Lambert N, Wilcox A, Zhang H, Pister KS, Calandra R (2021) Learning accurate long-term dynamics for model-based reinforcement learning. In: Proceedings of the IEEE conference on decision and control (CDC)","DOI":"10.1109\/CDC45484.2021.9683134"},{"key":"6083_CR54","unstructured":"Mondal AK, Panigrahi SS, Rajeswar S, Siddiqi K, Ravanbakhsh S (2024) Efficient dynamics modeling in interactive environments with koopman theory. 
In: Proceedings of the international conference on machine learning (ICML)"},{"key":"6083_CR55","unstructured":"Schulman J, Wolski F, Dhariwal P, Radford A, Klimov O (2017) Proximal policy optimization algorithms"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-06083-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-024-06083-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-06083-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T13:56:33Z","timestamp":1758290193000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-024-06083-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,29]]},"references-count":55,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["6083"],"URL":"https:\/\/doi.org\/10.1007\/s10489-024-06083-9","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"type":"print","value":"0924-669X"},{"type":"electronic","value":"1573-7497"}],"subject":[],"published":{"date-parts":[[2025,4,29]]},"assertion":[{"value":"16 November 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 April 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"The experimental environment used in this article is OpenAI\u2019s open-source library, Gym, which we are therefore permitted to use.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and informed consent for data used"}}],"article-number":"700"}}
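A record in exactly this shape can be retrieved from the public Crossref REST API at https://api.crossref.org/works/{DOI}. The Python sketch below is illustrative only: it assumes the third-party requests package is installed, the mailto address is a placeholder you would replace with your own (per Crossref's polite-pool convention), and the fields it reads ("message", "title", "reference", "references-count") are the ones visible in the record above.

import requests

DOI = "10.1007/s10489-024-06083-9"
URL = f"https://api.crossref.org/works/{DOI}"

# "mailto" identifies the caller to Crossref's polite pool; placeholder address.
resp = requests.get(URL, params={"mailto": "you@example.org"}, timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the payload sits under "message", as in the record above

print(work["title"][0])             # article title
print(work["container-title"][0])   # journal name: "Applied Intelligence"
print(work["DOI"], "vol.", work.get("volume"), "issue", work.get("issue"))
print("references:", work.get("references-count"))
for ref in work.get("reference", [])[:3]:  # first few reference entries
    print("-", ref.get("unstructured") or ref.get("DOI", ref["key"]))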