{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T00:49:13Z","timestamp":1775782153650,"version":"3.50.1"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,4,10]],"date-time":"2025-04-10T00:00:00Z","timestamp":1744243200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,10]],"date-time":"2025-04-10T00:00:00Z","timestamp":1744243200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s10994-025-06763-8","type":"journal-article","created":{"date-parts":[[2025,4,10]],"date-time":"2025-04-10T11:04:44Z","timestamp":1744283084000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Data-efficient reinforcement learning by generalized value estimation"],"prefix":"10.1007","volume":"114","author":[{"given":"Junjie","family":"Zhou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ying","family":"Tian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minglun","family":"Ren","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,4,10]]},"reference":[{"key":"6763_CR1","unstructured":"Anschel, O., Baram, N., & Shimkin, N. (2017). Averaged-dqn: Variance reduction and stabilization for deep reinforcement learning. In Proceedings of the 34th international conference on machine learning (ICML\u201917), Sydney, Australia."},{"key":"6763_CR2","doi-asserted-by":"crossref","unstructured":"Asis, K. D., Hernandez-Garcia, J. F., & Holland, G. Z. et\u00a0al. (2018). Multi-step reinforcement learning: A unifying algorithm. In Proceedings of the 32nd AAAI conference on artificial intelligence (AAAI\u201918), New Orleans, Louisiana.","DOI":"10.1609\/aaai.v32i1.11631"},{"key":"6763_CR3","unstructured":"Bellemare, M. G., Dabney, W., & Munos, R. (2017). A distributional perspective on reinforcement learning. In Precup, D., Teh, Y. W. (Eds.) Proceedings of the 34th international conference on machine learning (ICML\u201917), Sydney, Australia."},{"key":"6763_CR4","unstructured":"Buckman, J., Hafner, D., & Tucker, G., et\u00a0al. (2018). Sample-efficient reinforcement learning with stochastic ensemble value expansion. In Advances in neural information processing systems 31 (NeurIPS\u201918), Montr\u00e9al, Canada."},{"key":"6763_CR5","unstructured":"Chen, X., Wang, C., & Zhou, Z., et\u00a0al. (2021). Randomized ensembled double q-learning: Learning fast without a model. In 9th International conference on learning representations (ICLR\u201921), Virtual Event."},{"key":"6763_CR6","doi-asserted-by":"crossref","unstructured":"Cho, K., van Merrienboer, B., & G\u00fcl\u00e7ehre, \u00c7 et\u00a0al. (2014). Learning phrase representations using RNN encoder-decoder for statistical machine translation. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP\u201914), Doha, Qatar.","DOI":"10.3115\/v1\/D14-1179"},{"key":"6763_CR7","unstructured":"Chua, K., Calandra, R., & McAllister, R., et\u00a0al. (2018). Deep reinforcement learning in a handful of trials using probabilistic dynamics models. In Advances in neural information processing systems 31 (NeurIPS\u201918), Montr\u00e9al, Canada."},{"key":"6763_CR8","unstructured":"Clavera, I., Fu, Y., & Abbeel, P. (2020). Model-augmented actor-critic: Backpropagating through paths. In 8th International conference on learning representations (ICLR\u201920), Addis Ababa, Ethiopia."},{"key":"6763_CR9","doi-asserted-by":"crossref","unstructured":"Dabney, W., Rowland, M., & Bellemare, M. G., et\u00a0al. (2018). Distributional reinforcement learning with quantile regression. In Proceedings of the thirty-second AAAI conference on artificial intelligence (AAAI\u201918), New Orleans, Louisiana.","DOI":"10.1609\/aaai.v32i1.11791"},{"issue":"2","key":"6763_CR10","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1207\/s15516709cog1402_1","volume":"14","author":"JL Elman","year":"1990","unstructured":"Elman, J. L. (1990). Finding structure in time. Cognitive Science, 14(2), 179\u2013211.","journal-title":"Cognitive Science"},{"key":"6763_CR11","unstructured":"Feinberg, V., Wan, A., & Stoica, I., et\u00a0al. (2018). Model-based value estimation for efficient model-free reinforcement learning. CoRR arXiv:1803.00101."},{"key":"6763_CR12","unstructured":"Fujimoto, S., van Hoof, H., & Meger, D. (2018). Addressing function approximation error in actor-critic methods. In Proceedings of the 35th international conference on machine learning (ICML\u201918), Stockholm, Sweden."},{"key":"6763_CR13","unstructured":"Gangwani, T., Zhou, Y., & Peng, J. (2020). Learning guidance rewards with trajectory-space smoothing. In Advances in neural information processing systems 33 (NeurIPS\u201920), Virtual Event."},{"key":"6763_CR14","unstructured":"Haarnoja, T., Zhou, A., & Abbeel, P., et\u00a0al. (2018). Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor. In Proceedings of the 35th international conference on machine learning (ICML\u201918), Stockholm, Sweden."},{"key":"6763_CR15","doi-asserted-by":"crossref","unstructured":"Hasselt, H. V., Guez, A., & Silver, D. (2016). Deep reinforcement learning with double q-learning. In Proceedings of the 30th AAAI conference on artificial intelligence (AAAI\u201916), Phoenix, Arizona.","DOI":"10.1609\/aaai.v30i1.10295"},{"key":"6763_CR16","doi-asserted-by":"crossref","unstructured":"Hessel, M., Modayil, J., & van Hasselt, H., et\u00a0al. (2018). Rainbow: Combining improvements in deep reinforcement learning. In Proceedings of the thirty-second AAAI conference on artificial intelligence (AAAI\u201918), New Orleans, Louisiana.","DOI":"10.1609\/aaai.v32i1.11796"},{"key":"6763_CR17","unstructured":"Janner, M., Fu, J., & Zhang, M., et\u00a0al. (2019). When to trust your model: Model-based policy optimization. In Advances in neural information processing systems 32 (NeurIPS\u201919), Vancouver, Canada."},{"issue":"3","key":"6763_CR18","doi-asserted-by":"publisher","first-page":"524","DOI":"10.1109\/LCSYS.2020.2970555","volume":"4","author":"I John","year":"2020","unstructured":"John, I., Kamanchi, C., & Bhatnagar, S. (2020). Generalized speedy q-learning. IEEE Control Systems Letters, 4(3), 524\u2013529.","journal-title":"IEEE Control Systems Letters"},{"issue":"2","key":"6763_CR19","doi-asserted-by":"publisher","first-page":"465","DOI":"10.1007\/s10994-021-06116-1","volume":"111","author":"O Kilinc","year":"2022","unstructured":"Kilinc, O., & Montana, G. (2022). Reinforcement learning for robotic manipulation using simulated locomotion demonstrations. Machine Learning, 111(2), 465\u2013486.","journal-title":"Machine Learning"},{"key":"6763_CR20","doi-asserted-by":"crossref","unstructured":"Kim, W., Shin, Y., & Park, J., et\u00a0al. (2023). Sample-efficient and safe deep reinforcement learning via reset deep ensemble agents. In Advances in neural information processing systems 36 (NeurIPS\u201923), New Orleans, LA.","DOI":"10.52202\/075280-2317"},{"issue":"9","key":"6763_CR21","doi-asserted-by":"publisher","first-page":"2501","DOI":"10.1007\/s10994-021-06006-6","volume":"110","author":"S Krishnan","year":"2021","unstructured":"Krishnan, S., Boroujerdian, B., Fu, W., et al. (2021). Air learning: A deep reinforcement learning gym for autonomous aerial robot visual navigation. Machine Learning, 110(9), 2501\u20132540.","journal-title":"Machine Learning"},{"key":"6763_CR22","unstructured":"Lan, Q., Pan, Y., & Fyshe, A., et\u00a0al. (2020). Maxmin q-learning: Controlling the estimation bias of q-learning. In 8th International conference on learning representations (ICLR\u201920), Addis Ababa, Ethiopia."},{"key":"6763_CR23","unstructured":"Li, C., Wang, Y., & Chen, W., et\u00a0al. (2022). Gradient information matters in policy optimization by back-propagating through model. In 10th International conference on learning representations (ICLR\u201922), Virtual Conference, https:\/\/openreview.net\/forum?id=rzvOQrnclO0."},{"key":"6763_CR24","unstructured":"Lillicrap, T. P., Hunt, J. J., & Pritzel, A., et\u00a0al. (2016). Continuous control with deep reinforcement learning. In 4th International conference on learning representations (ICLR\u201916), San Juan, Puerto Rico."},{"key":"6763_CR25","doi-asserted-by":"crossref","unstructured":"Lin, H., Sun, Y., & Zhang, J., et\u00a0al. (2023). Model-based reinforcement learning with multi-step plan value estimation. In Proceedings of the 26th European conference on artificial intelligence (ECAI\u201923), Krak\u00f3w, Poland.","DOI":"10.3233\/FAIA230427"},{"key":"6763_CR26","doi-asserted-by":"crossref","unstructured":"Lin, H., Wu, H., & Zhang, J., et\u00a0al. (2024). Episodic return decomposition by difference of implicitly assigned sub-trajectory reward. In Proceedings of the 38th AAAI conference on artificial intelligence (AAAI\u201924), Vancouver, Canada.","DOI":"10.1609\/aaai.v38i12.29287"},{"key":"6763_CR27","unstructured":"Lin, H., Xu, Y., & Sun, Y., et\u00a0al. (2025). Any-step dynamics model improves future predictions for online and offline reinforcement learning. In The 13th international conference on learning representations (ICLR\u201925), Singapore."},{"key":"6763_CR28","unstructured":"Lu, T., Schuurmans, D., & Boutilier, C. (2018). Non-delusional q-learning and value-iteration. In Advances in neural information processing systems 31 (NeurIPS\u201918), Montr\u00e9al, Canada."},{"key":"6763_CR29","unstructured":"Luo, Y., Xu, H., & Li, Y., et\u00a0al. (2019). Algorithmic framework for model-based deep reinforcement learning with theoretical guarantees. In 7th International conference on learning representations (ICLR\u201919), New Orleans, LA."},{"issue":"7540","key":"6763_CR30","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1038\/nature14236","volume":"518","author":"V Mnih","year":"2015","unstructured":"Mnih, V., Kavukcuoglu, K., Silver, D., et al. (2015). Human-level control through deep reinforcement learning. Nature, 518(7540), 529\u2013533.","journal-title":"Nature"},{"key":"6763_CR31","unstructured":"Nikishin, E., Schwarzer, M., & D\u2019Oro, P., et\u00a0al. (2022). The primacy bias in deep reinforcement learning. In Proceedings of the 39th international conference on machine learning (ICML\u201922), Baltimore, Maryland."},{"key":"6763_CR32","unstructured":"Ren, Z., Guo, R., & Zhou, Y., et\u00a0al. (2022). Learning long-term reward redistribution via randomized return decomposition. In 10th International conference on learning representations (ICLR\u201922), Virtual Event."},{"key":"6763_CR33","unstructured":"Schulman, J., Levine, S., & Abbeel, P., et\u00a0al. (2015). Trust region policy optimization. In Proceedings of the 32nd international conference on machine learning (ICML\u201915), Lille, France."},{"key":"6763_CR34","unstructured":"Schulman, J., Wolski, F., & Dhariwal, P., et\u00a0al. (2017). Proximal policy optimization algorithms. CoRR arXiv:1707.06347."},{"issue":"6419","key":"6763_CR35","doi-asserted-by":"publisher","first-page":"1140","DOI":"10.1126\/science.aar6404","volume":"362","author":"D Silver","year":"2018","unstructured":"Silver, D., Hubert, T., Schrittwieser, J., et al. (2018). A general reinforcement learning algorithm that masters chess, shogi, and go through self-play. Science, 362(6419), 1140\u20131144.","journal-title":"Science"},{"key":"6763_CR36","volume-title":"Reinforcement learning: An introduction","author":"RS Sutton","year":"2018","unstructured":"Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction. Cambridge: MIT Press."},{"key":"6763_CR37","doi-asserted-by":"crossref","unstructured":"Todorov, E., Erez, T., & Tassa, Y. (2012). Mujoco: A physics engine for model-based control. In IEEE\/RSJ International conference on intelligent robots and systems (IROS\u201920), Vilamoura, Portugal.","DOI":"10.1109\/IROS.2012.6386109"},{"key":"6763_CR38","unstructured":"van Hasselt, H. (2010a). Double q-learning. In Advances in neural information processing systems 23 (NeurIPS\u201910), Vancouver, Canada."},{"key":"6763_CR39","unstructured":"van Hasselt, H. (2010b). Double q-learning. In Advances in neural information processing systems 23 (NeurIPS\u201910), Vancouver, Canada."},{"key":"6763_CR40","doi-asserted-by":"publisher","first-page":"43098","DOI":"10.1109\/ACCESS.2020.2977400","volume":"8","author":"B Wang","year":"2020","unstructured":"Wang, B., Li, X., Gao, Z., et al. (2020). Risk aversion operator for addressing maximization bias in q-learning. IEEE Access, 8, 43098\u201343110.","journal-title":"IEEE Access"},{"key":"6763_CR41","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.109998","volume":"258","author":"B Wang","year":"2022","unstructured":"Wang, B., Wu, J., Li, X., et al. (2022). Uncertainty quantification for operators in online reinforcement learning. Knowledge-Based Systems, 258, 109998.","journal-title":"Knowledge-Based Systems"},{"key":"6763_CR42","unstructured":"Wu, Y., Zhai, S., & Srivastava, N., et\u00a0al. (2021). Uncertainty weighted actor-critic for offline reinforcement learning. In Proceedings of the 38th international conference on machine learning (ICML\u201921), Virtual Event."}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-025-06763-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-025-06763-8","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-025-06763-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T00:02:13Z","timestamp":1775779333000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-025-06763-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,10]]},"references-count":42,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["6763"],"URL":"https:\/\/doi.org\/10.1007\/s10994-025-06763-8","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,10]]},"assertion":[{"value":"11 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 March 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 March 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 April 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}],"article-number":"133"}}