{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T03:41:55Z","timestamp":1775187715600,"version":"3.50.1"},"reference-count":59,"publisher":"Springer Science and Business Media LLC","issue":"8093","license":[{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T00:00:00Z","timestamp":1764115200000},"content-version":"vor","delay-in-days":35,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Nature"],"published-print":{"date-parts":[[2025,12,11]]},"DOI":"10.1038\/s41586-025-09761-x","type":"journal-article","created":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T15:02:19Z","timestamp":1761145339000},"page":"312-319","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Discovering state-of-the-art reinforcement learning algorithms"],"prefix":"10.1038","volume":"648","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4383-6396","authenticated-orcid":false,"given":"Junhyuk","family":"Oh","sequence":"first","affiliation":[]},{"given":"Gregory","family":"Farquhar","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6804-5936","authenticated-orcid":false,"given":"Iurii","family":"Kemaev","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7283-5670","authenticated-orcid":false,"given":"Dan 
A.","family":"Calian","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9946-4375","authenticated-orcid":false,"given":"Matteo","family":"Hessel","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5864-7632","authenticated-orcid":false,"given":"Luisa","family":"Zintgraf","sequence":"additional","affiliation":[]},{"given":"Satinder","family":"Singh","sequence":"additional","affiliation":[]},{"given":"Hado","family":"van Hasselt","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5197-2892","authenticated-orcid":false,"given":"David","family":"Silver","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,22]]},"reference":[{"key":"9761_CR1","unstructured":"Kirsch, L., van Steenkiste, S. & Schmidhuber, J. Improving generalization in meta reinforcement learning using learned objectives. In Proc. International Conference on Learning Representations (ICLR, 2020)."},{"key":"9761_CR2","doi-asserted-by":"crossref","unstructured":"Kirsch, L. et al. Introducing symmetries to black box meta reinforcement learning. In Proc. AAAI Conference on Artificial Intelligence 36, 7202\u20137210 (Association for the Advancement of Artificial Intelligence, 2022).","DOI":"10.1609\/aaai.v36i7.20681"},{"key":"9761_CR3","unstructured":"Oh, J. et al. Discovering reinforcement learning algorithms. In Proc. Adv. Neural Inf. Process. Syst. 33, 1060\u20131070 (NeurIPS, 2020)."},{"key":"9761_CR4","unstructured":"Xu, Z. et al. Meta-gradient reinforcement learning with an objective discovered online. In Proc. Adv. Neural Inf. Process. Syst. 33, 15254\u201315264 (NeurIPS, 2020)."},{"key":"9761_CR5","unstructured":"Houthooft, R. et al. Evolved policy gradients. In Proc. Adv. Neural Inf. Process. Syst. 31, 5405\u20135414 (NeurIPS, 2018)."},{"key":"9761_CR6","unstructured":"Lu, C. et al. Discovered policy optimisation. In Proc. Adv. Neural Inf. Process. Syst. 
35, 16455\u201316468 (NeurIPS, 2022)."},{"key":"9761_CR7","doi-asserted-by":"publisher","first-page":"484","DOI":"10.1038\/nature16961","volume":"529","author":"D Silver","year":"2016","unstructured":"Silver, D. et al. Mastering the game of Go with deep neural networks and tree search. Nature 529, 484\u2013489 (2016).","journal-title":"Nature"},{"key":"9761_CR8","doi-asserted-by":"publisher","first-page":"604","DOI":"10.1038\/s41586-020-03051-4","volume":"588","author":"J Schrittwieser","year":"2020","unstructured":"Schrittwieser, J. et al. Mastering Atari, Go, chess and shogi by planning with a learned model. Nature 588, 604\u2013609 (2020).","journal-title":"Nature"},{"key":"9761_CR9","doi-asserted-by":"publisher","first-page":"350","DOI":"10.1038\/s41586-019-1724-z","volume":"575","author":"O Vinyals","year":"2019","unstructured":"Vinyals, O. et al. Grandmaster level in StarCraft II using multi-agent reinforcement learning. Nature 575, 350\u2013354 (2019).","journal-title":"Nature"},{"key":"9761_CR10","doi-asserted-by":"publisher","first-page":"647","DOI":"10.1038\/s41586-025-08744-2","volume":"640","author":"D Hafner","year":"2025","unstructured":"Hafner, D., Pasukonis, J., Ba, J. & Lillicrap, T. Mastering diverse control tasks through world models. Nature 640, 647\u2013653 (2025).","journal-title":"Nature"},{"key":"9761_CR11","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1038\/s41586-022-05172-4","volume":"610","author":"A Fawzi","year":"2022","unstructured":"Fawzi, A. et al. Discovering faster matrix multiplication algorithms with reinforcement learning. Nature 610, 47\u201353 (2022).","journal-title":"Nature"},{"key":"9761_CR12","doi-asserted-by":"publisher","first-page":"414","DOI":"10.1038\/s41586-021-04301-9","volume":"602","author":"J Degrave","year":"2022","unstructured":"Degrave, J. et al. Magnetic control of tokamak plasmas through deep reinforcement learning. 
Nature 602, 414\u2013419 (2022).","journal-title":"Nature"},{"key":"9761_CR13","unstructured":"Xu, Z., van Hasselt, H. P. & Silver, D. Meta-gradient reinforcement learning. In Proc. Adv. Neural Inf. Process. Syst. 31, 2402\u20132413 (NeurIPS, 2018)."},{"key":"9761_CR14","unstructured":"Zahavy, T. et al. A self-tuning actor\u2013critic algorithm. In Proc. Adv. Neural Inf. Process. Syst. 33, 20913\u201320924 (NeurIPS, 2020)."},{"key":"9761_CR15","unstructured":"Jackson, M. T. et al. Discovering general reinforcement learning algorithms with adversarial environment design. In Proc. Adv. Neural Inf. Process. Syst. 36, 79980\u201379998 (NeurIPS, 2023)."},{"key":"9761_CR16","unstructured":"Sutton, R. S. & Barto, A. G. Reinforcement learning: An Introduction (MIT Press, 2018)."},{"key":"9761_CR17","first-page":"279","volume":"8","author":"CJ Watkins","year":"1992","unstructured":"Watkins, C. J. & Dayan, P. Q-learning. Mach. Learn. 8, 279\u2013292 (1992).","journal-title":"Mach. Learn."},{"key":"9761_CR18","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A. & Klimov, O. Proximal policy optimization algorithms. Preprint at https:\/\/arxiv.org\/abs\/1707.06347 (2017)."},{"key":"9761_CR19","unstructured":"Jaderberg, M. et al. Reinforcement learning with unsupervised auxiliary tasks. In Proc. International Conference on Learning Representations (ICLR, 2017)."},{"key":"9761_CR20","unstructured":"Barreto, A. et al. Successor features for transfer in reinforcement learning. In Proc. Adv. Neural Inf. Process. Syst. 30, 4055\u20134065 (NeurIPS, 2017)."},{"key":"9761_CR21","unstructured":"Bellemare, M. G., Dabney, W. & Munos, R. A distributional perspective on reinforcement learning. In Proc. International Conference on Machine Learning 449\u2013458 (PMLR, 2017)."},{"key":"9761_CR22","doi-asserted-by":"publisher","first-page":"253","DOI":"10.1613\/jair.3912","volume":"47","author":"MG Bellemare","year":"2013","unstructured":"Bellemare, M. G., Naddaf, Y., Veness, J. 
& Bowling, M. The arcade learning environment: an evaluation platform for general agents. J. Artif. Intell. Res. 47, 253\u2013279 (2013).","journal-title":"J. Artif. Intell. Res."},{"key":"9761_CR23","unstructured":"Cobbe, K., Hesse, C., Hilton, J. & Schulman, J. Leveraging procedural generation to benchmark reinforcement learning. In Proc. International Conference on Machine Learning 2048\u20132056 (PMLR, 2020)."},{"key":"9761_CR24","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S. & Schmidhuber, J. Long short-term memory. Neural Comput. 9, 1735\u20131780 (1997).","journal-title":"Neural Comput."},{"key":"9761_CR25","unstructured":"Veeriah, V. et al. Discovery of useful questions as auxiliary tasks. In Proc. Adv. Neural Inf. Process. Syst. 32, 9306\u20139317 (NeurIPS, 2019)."},{"key":"9761_CR26","unstructured":"Munos, R., Stepleton, T., Harutyunyan, A. & Bellemare, M. Safe and efficient off-policy reinforcement learning. In Proc. Adv. Neural Inf. Process. Syst. 29, 1054\u20131062 (NeurIPS, 2016)."},{"key":"9761_CR27","unstructured":"Finn, C., Abbeel, P. & Levine, S. Model-agnostic meta-learning for fast adaptation of deep networks. In Proc. International Conference on Machine Learning 70, 1126\u20131135 (PMLR, 2017)."},{"key":"9761_CR28","unstructured":"Mnih, V. et al. Asynchronous methods for deep reinforcement learning. In Proc. International Conference on Machine Learning 48, 1928\u20131937 (PMLR, 2016)."},{"key":"9761_CR29","unstructured":"Agarwal, R., Schwarzer, M., Castro, P. S., Courville, A. C. & Bellemare, M. Deep reinforcement learning at the edge of the statistical precipice. In Proc. Adv. Neural Inf. Process. Syst. 34, 29304\u201329320 (NeurIPS, 2021)."},{"key":"9761_CR30","unstructured":"Kapturowski, S. et al. Human-level Atari 200x faster. In Proc. 
International Conference on Learning Representations (ICLR, 2023)."},{"key":"9761_CR31","unstructured":"Hafner, D. Benchmarking the spectrum of agent capabilities. In Proc. International Conference on Learning Representations (ICLR, 2022)."},{"key":"9761_CR32","unstructured":"K\u00fcttler, H. et al. The nethack learning environment. In Proc. Adv. Neural Inf. Process. Syst. 33, 7671\u20137684 (NeurIPS, 2020)."},{"key":"9761_CR33","unstructured":"Hambro, E. et al. Insights from the NeurIPS 2021 NetHack challenge. In Proc. NeurIPS 2021 Competitions and Demonstrations Track 41\u201352 (PMLR, 2022)."},{"key":"9761_CR34","unstructured":"Espeholt, L. et al. IMPALA: scalable distributed deep-RL with importance weighted actor-learner architectures. In Proc. International Conference on Learning Representations (ICLR, 2018)."},{"key":"9761_CR35","unstructured":"Beattie, C. et al. DeepMind Lab. Preprint at https:\/\/arxiv.org\/abs\/1612.03801 (2016)."},{"key":"9761_CR36","unstructured":"Racani\u00e8re, S. et al. Imagination-augmented agents for deep reinforcement learning. In Proc. Adv. Neural Inf. Process. Syst. 30, 5690\u20135701 (NeurIPS, 2017)."},{"key":"9761_CR37","unstructured":"Schmidhuber, J. Evolutionary Principles in Self-referential Learning, or on Learning How to Learn: the Meta-meta-\u2026 Hook. PhD thesis, Technische Univ. M\u00fcnchen (1987)."},{"key":"9761_CR38","doi-asserted-by":"crossref","unstructured":"Schmidhuber, J. A possibility for implementing curiosity and boredom in model-building neural controllers. In Proc. International Conference on Simulation of Adaptive Behavior: from Animals to Animats 222\u2013227 (MIT Press, 1991).","DOI":"10.7551\/mitpress\/3115.003.0030"},{"key":"9761_CR39","unstructured":"Schmidhuber, J., Zhao, J. & Wiering, M. Simple Principles of Metalearning. Report No. IDSIA-69-96 (Istituto Dalle Molle Di Studi Sull Intelligenza Artificiale, 1996)."},{"key":"9761_CR40","doi-asserted-by":"crossref","unstructured":"Thrun, S. & Pratt, L. 
Learning to Learn: Introduction and Overview 3-17 (Springer, 1998).","DOI":"10.1007\/978-1-4615-5529-2_1"},{"key":"9761_CR41","doi-asserted-by":"publisher","first-page":"1345","DOI":"10.1109\/TKDE.2009.191","volume":"22","author":"SJ Pan","year":"2009","unstructured":"Pan, S. J. & Yang, Q. A survey on transfer learning. IEEE Trans. Knowl. Data Eng. 22, 1345\u20131359 (2009).","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"9761_CR42","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1016\/j.neunet.2019.01.012","volume":"113","author":"GI Parisi","year":"2019","unstructured":"Parisi, G. I., Kemker, R., Part, J. L., Kanan, C. & Wermter, S. Continual lifelong learning with neural networks: a review. Neural Netw. 113, 54\u201371 (2019).","journal-title":"Neural Netw."},{"key":"9761_CR43","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1023\/A:1007379606734","volume":"28","author":"R Caruana","year":"1997","unstructured":"Caruana, R. Multitask learning. Mach. Learn. 28, 41\u201375 (1997).","journal-title":"Mach. Learn."},{"key":"9761_CR44","doi-asserted-by":"crossref","unstructured":"Feurer, M. & Hutter, F. Hyperparameter Optimization 3\u201333 (Springer, 2019).","DOI":"10.1007\/978-3-030-05318-5_1"},{"key":"9761_CR45","unstructured":"Yao, Q. et al. Taking human out of learning applications: a survey on automated machine learning. Preprint at https:\/\/www.arxiv.org\/abs\/1810.13306v3 (2018)."},{"key":"9761_CR46","unstructured":"Storck, J., et al. Reinforcement driven information acquisition in non-deterministic environments. In International Conference on Artificial Neural Networks 2, 159\u2013164 (ICANN, 1995)."},{"key":"9761_CR47","unstructured":"Duan, Y. et al. RL2: fast reinforcement learning via slow reinforcement learning. Preprint at https:\/\/arxiv.org\/abs\/1611.02779 (2016)."},{"key":"9761_CR48","doi-asserted-by":"crossref","unstructured":"Niv, Y., Joel, D., Meilijson, I. & Ruppin, E. 
Evolution of reinforcement learning in uncertain environments: a simple explanation for complex foraging behaviors. Adapt. Behav. 10, 5\u201324 (2002).","DOI":"10.1177\/10597123020101001"},{"key":"9761_CR49","unstructured":"Xiong, Z., Zintgraf, L., Beck, J., Vuorio, R. & Whiteson, S. On the practical consistency of meta-reinforcement learning algorithms. Preprint at https:\/\/arxiv.org\/abs\/2112.00478 (2021)."},{"key":"9761_CR50","unstructured":"Sutton, R. S. & Tanner, B. Temporal-difference networks. In Proc. Adv. Neural Inf. Process. Syst. 17, 1377\u20131384 (NeurIPS, 2004)."},{"key":"9761_CR51","doi-asserted-by":"crossref","unstructured":"Mnih, V., Kavukcuoglu, K., Silver, D. et al. Human-level control through deep reinforcement learning. Nature 518, 529\u2013533 (2015).","DOI":"10.1038\/nature14236"},{"key":"9761_CR52","unstructured":"Cobbe, K., Hilton, J., Klimov, O., and Schulman, J. Phasic policy gradient. In Proc. International Conference on Machine Learning 139, 2020\u20132027 (PMLR, 2021)."},{"key":"9761_CR53","doi-asserted-by":"crossref","unstructured":"Hessel, M. et al. Rainbow: combining improvements in deep reinforcement learning. In Proc. AAAI Conference on Artificial Intelligence 32, 3215\u20133222 (Association for the Advancement of Artificial Intelligence, 2018).","DOI":"10.1609\/aaai.v32i1.11796"},{"key":"9761_CR54","doi-asserted-by":"crossref","unstructured":"Sutton, R. S. Learning to predict by the methods of temporal differences. Mach. Learn. 3, 9\u201344 (1988).","DOI":"10.1007\/BF00115009"},{"key":"9761_CR55","unstructured":"Bradbury, J. et al. JAX: composable transformations of Python+ NumPy programs. http:\/\/github.com\/jax-ml\/jax (2018)."},{"key":"9761_CR56","unstructured":"DeepMind et al. The DeepMind JAX ecosystem. GitHub http:\/\/github.com\/google-deepmind (2020)."},{"key":"9761_CR57","unstructured":"Jouppi, N. P. et al. In-datacenter performance analysis of a tensor processing unit. In Proc. 
Annual International Symposium on Computer Architecture 1\u201312 (ISCA, 2017)."},{"key":"9761_CR58","unstructured":"Hessel, M. et al. Podracer architectures for scalable reinforcement learning. Preprint at https:\/\/arxiv.org\/abs\/2104.06272 (2021)."},{"key":"9761_CR59","unstructured":"Kemaev, I., Calian, D. A., Zintgraf, L. M., Farquhar, G. & van Hasselt, H. Scalable meta-learning via mixed-mode differentiation. In Proc. International Conference on Machine Learning 267, 29687\u201319605 (PMLR, 2025)."}],"container-title":["Nature"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s41586-025-09761-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41586-025-09761-x","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41586-025-09761-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T17:28:59Z","timestamp":1765387739000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s41586-025-09761-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,22]]},"references-count":59,"journal-issue":{"issue":"8093","published-print":{"date-parts":[[2025,12,11]]}},"alternative-id":["9761"],"URL":"https:\/\/doi.org\/10.1038\/s41586-025-09761-x","relation":{},"ISSN":["0028-0836","1476-4687"],"issn-type":[{"value":"0028-0836","type":"print"},{"value":"1476-4687","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,22]]},"assertion":[{"value":"11 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 October 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 October 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"A patent application(s) directed to aspects of the work described has been filed and is pending as of the date of manuscript submission. Google LLC has ownership and potential commercial interests in the work described.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}