{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,8]],"date-time":"2025-07-08T04:03:57Z","timestamp":1751947437782,"version":"3.41.2"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,2,20]],"date-time":"2025-02-20T00:00:00Z","timestamp":1740009600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,20]],"date-time":"2025-02-20T00:00:00Z","timestamp":1740009600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100000181","name":"Air Force Office of Scientific Research","doi-asserted-by":"publisher","award":["FA2386-22-1-4026","FA2386-24-1-4012"],"award-info":[{"award-number":["FA2386-22-1-4026","FA2386-24-1-4012"]}],"id":[{"id":"10.13039\/100000181","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Auton Agent Multi-Agent Syst"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s10458-025-09695-8","type":"journal-article","created":{"date-parts":[[2025,2,20]],"date-time":"2025-02-20T10:28:06Z","timestamp":1740047286000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Low variance trust region optimization with independent actors and sequential updates in cooperative multi-agent reinforcement learning"],"prefix":"10.1007","volume":"39","author":[{"given":"Bang Giang","family":"Le","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Viet Cuong","family":"Ta","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,2,20]]},"reference":[{"key":"9695_CR1","unstructured":"Schulman, J., Levine, S., Abbeel, P., Jordan, M., & Moritz, P. (2015). Trust region policy optimization. In International conference on machine learning (pp. 1889\u20131897). PMLR."},{"key":"9695_CR2","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O. (2017). Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347"},{"key":"9695_CR3","unstructured":"Kuba, J. G., Chen, R., Wen, M., Wen, Y., Sun, F., Wang, J., & Yang, Y. (2022). Trust region policy optimisation in multi-agent reinforcement learning. In International conference on learning representations. https:\/\/openreview.net\/forum?id=EcGGFkNTxdJ"},{"issue":"1","key":"9695_CR4","first-page":"7234","volume":"21","author":"T Rashid","year":"2020","unstructured":"Rashid, T., Samvelyan, M., De Witt, C. S., Farquhar, G., Foerster, J., & Whiteson, S. (2020). Monotonic value function factorisation for deep multi-agent reinforcement learning. The Journal of Machine Learning Research, 21(1), 7234\u20137284.","journal-title":"The Journal of Machine Learning Research"},{"key":"9695_CR5","unstructured":"Lowe, R., Wu, Y. I., Tamar, A., Harb, J., Pieter\u00a0Abbeel, O., & Mordatch, I. (2017). Multi-agent actor-critic for mixed cooperative-competitive environments. In Advances in neural information processing systems (vol. 30)."},{"key":"9695_CR6","unstructured":"Witt, C. S., Gupta, T., Makoviichuk, D., Makoviychuk, V., Torr, P. H., Sun, M., & Whiteson, S. (2020). Is independent learning all you need in the starcraft multi-agent challenge? arXiv preprint arXiv:2011.09533"},{"key":"9695_CR7","first-page":"24611","volume":"35","author":"C Yu","year":"2022","unstructured":"Yu, C., Velu, A., Vinitsky, E., Gao, J., Wang, Y., Bayen, A., & Wu, Y. (2022). The surprising effectiveness of ppo in cooperative multi-agent games. Advances in Neural Information Processing Systems, 35, 24611\u201324624.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"9695_CR8","first-page":"5527","volume":"33","author":"C Daskalakis","year":"2020","unstructured":"Daskalakis, C., Foster, D. J., & Golowich, N. (2020). Independent policy gradient methods for competitive reinforcement learning. Advances in Neural Information Processing Systems, 33, 5527\u20135540.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"9695_CR9","unstructured":"Leonardos, S., Overman, W., Panageas, I., & Piliouras, G. (2022). Global convergence of multi-agent policy gradient in Markov potential games. In International conference on learning representations. https:\/\/openreview.net\/forum?id=gfwON7rAm4"},{"key":"9695_CR10","unstructured":"Ding, D., Wei, C. -Y., Zhang, K., & Jovanovic, M. (2022). Independent policy gradient for large-scale Markov potential games: Sharper rates, function approximation, and game-agnostic convergence. In International conference on machine learning (pp. 5166\u20135220). PMLR."},{"key":"9695_CR11","doi-asserted-by":"crossref","unstructured":"Foerster, J., Farquhar, G., Afouras, T., Nardelli, N., & Whiteson, S. (2018). Counterfactual multi-agent policy gradients. In Proceedings of the AAAI conference on artificial intelligence (vol. 32).","DOI":"10.1609\/aaai.v32i1.11794"},{"key":"9695_CR12","unstructured":"Engstrom, L., Ilyas, A., Santurkar, S., Tsipras, D., Janoos, F., Rudolph, L., & Madry, A. (2019). Implementation matters in deep rl: A case study on ppo and trpo. In International conference on learning representations."},{"key":"9695_CR13","unstructured":"Hernandez-Leal, P., Kaisers, M., Baarslag, T., De\u00a0Cote, E. M. (2017). A survey of learning in multiagent environments: Dealing with non-stationarity. arXiv preprint arXiv:1707.09183"},{"key":"9695_CR14","first-page":"16509","volume":"35","author":"M Wen","year":"2022","unstructured":"Wen, M., Kuba, J., Lin, R., Zhang, W., Wen, Y., Wang, J., & Yang, Y. (2022). Multi-agent reinforcement learning is a sequence modeling problem. Advances in Neural Information Processing Systems, 35, 16509\u201316521.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"9695_CR15","unstructured":"Zhao, Y., Yang, Z., Wang, Z., & Lee, J. D. (2023). Local optimization achieves global optimality in multi-agent reinforcement learning. In Proceedings of the 40th international conference on machine learning (vol. 202, pp. 42200\u201342226). https:\/\/proceedings.mlr.press\/v202\/zhao23j.html"},{"key":"9695_CR16","unstructured":"Li, W., Wang, X., Jin, B., Sheng, J., & Zha, H. (2022). Dealing with non-stationarity in MARL via trust-region decomposition. In International conference on learning representations. https:\/\/openreview.net\/forum?id=XHUxf5aRB3s"},{"key":"9695_CR17","first-page":"12208","volume":"34","author":"B Peng","year":"2021","unstructured":"Peng, B., Rashid, T., Witt, C., Kamienny, P.-A., Torr, P., B\u00f6hmer, W., & Whiteson, S. (2021). Facmac: Factored multi-agent centralised policy gradients. Advances in Neural Information Processing Systems, 34, 12208\u201312221.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"9695_CR18","doi-asserted-by":"crossref","unstructured":"Todorov, E., Erez, T., & Tassa, Y. (2012). Mujoco: A physics engine for model-based control. In 2012 IEEE\/RSJ international conference on intelligent robots and systems (pp. 5026\u20135033). IEEE.","DOI":"10.1109\/IROS.2012.6386109"},{"key":"9695_CR19","first-page":"26437","volume":"34","author":"Z Wu","year":"2021","unstructured":"Wu, Z., Yu, C., Ye, D., Zhang, J., Zhuo, H. H., et al. (2021). Coordinated proximal policy optimization. Advances in Neural Information Processing Systems, 34, 26437\u201326448.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"9695_CR20","unstructured":"Liu, B., Cai, Q., Yang, Z., & Wang, Z. (2019). Neural trust region\/proximal policy optimization attains globally optimal policy. In Advances in neural information processing systems (vol. 32)."},{"key":"9695_CR21","unstructured":"Huang, N. -C., Hsieh, P. -C., Ho, K. -H., Yao, H. -Y., Hu, K. -C., Ouyang, L. -C., & Wu, I. (2021). Neural ppo-clip attains global optimality: A hinge loss perspective. arXiv preprint arXiv:2110.13799"},{"key":"9695_CR22","unstructured":"Kakade, S., & Langford, J. (2002). Approximately optimal approximate reinforcement learning. In Proceedings of the 19th international conference on machine learning (pp. 267\u2013274)."},{"key":"9695_CR23","doi-asserted-by":"crossref","unstructured":"Shani, L., Efroni, Y., & Mannor, S. (2020). Adaptive trust region policy optimization: Global convergence and faster rates for regularized mdps. In Proceedings of the AAAI conference on artificial intelligence (vol. 34, pp. 5668\u20135675).","DOI":"10.1609\/aaai.v34i04.6021"},{"key":"9695_CR24","unstructured":"Papoudakis, G., Christianos, F., Sch\u00e4fer, L., & Albrecht, S. V. (2020). Benchmarking multi-agent deep reinforcement learning algorithms in cooperative tasks. arXiv preprint arXiv:2006.07869"},{"key":"9695_CR25","unstructured":"Christianos, F., Papoudakis, G., & Albrecht, S. V. (2023). Pareto actor-critic for equilibrium selection in multi-agent reinforcement learning. In Transactions on machine learning research."},{"key":"9695_CR26","unstructured":"Witt, C. S., Peng, B., Kamienny, P. -A., Torr, P., B\u00f6hmer, W., & Whiteson, S. (2020). Deep multi-agent reinforcement learning for decentralized continuous cooperative control (vol. 19). arXiv preprint arXiv:2003.06709"},{"key":"9695_CR27","unstructured":"Samvelyan, M., Rashid, T., Witt, C. S., Farquhar, G., Nardelli, N., Rudner, T. G. J., Hung, C. -M., Torr, P. H. S., Foerster, J., & Whiteson, S. (2019). The StarCraft multi-agent challenge. CoRR abs\/1902.04043."},{"issue":"32","key":"9695_CR28","first-page":"1","volume":"25","author":"Y Zhong","year":"2024","unstructured":"Zhong, Y., Kuba, J. G., Feng, X., Hu, S., Ji, J., & Yang, Y. (2024). Heterogeneous-agent reinforcement learning. Journal of Machine Learning Research, 25(32), 1\u201367.","journal-title":"Journal of Machine Learning Research"},{"issue":"13","key":"9695_CR29","doi-asserted-by":"publisher","first-page":"3521","DOI":"10.1073\/pnas.1611835114","volume":"114","author":"J Kirkpatrick","year":"2017","unstructured":"Kirkpatrick, J., Pascanu, R., Rabinowitz, N., Veness, J., Desjardins, G., Rusu, A. A., Milan, K., Quan, J., Ramalho, T., Grabska-Barwinska, A., et al. (2017). Overcoming catastrophic forgetting in neural networks. Proceedings of the National Academy of Sciences, 114(13), 3521\u20133526.","journal-title":"Proceedings of the National Academy of Sciences"},{"key":"9695_CR30","doi-asserted-by":"publisher","first-page":"12873","DOI":"10.1109\/TNNLS.2023.3265358","volume":"35","author":"H Li","year":"2023","unstructured":"Li, H., & He, H. (2023). Multiagent trust region policy optimization. IEEE Transactions on Neural Networks and Learning Systems, 35, 12873.","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"9695_CR31","unstructured":"Gu, S., Kuba, J. G., Wen, M., Chen, R., Wang, Z., Tian, Z., Wang, J., Knoll, A., & Yang, Y. (2021). Multi-agent constrained policy optimisation. arXiv preprint arXiv:2110.02793"},{"key":"9695_CR32","unstructured":"Tucker, G., Bhupatiraju, S., Gu, S., Turner, R., Ghahramani, Z., & Levine, S. (2018). The mirage of action-dependent baselines in reinforcement learning. In International conference on machine learning (pp. 5015\u20135024). PMLR."},{"key":"9695_CR33","unstructured":"Chung, W., Thomas, V., Machado, M. C., & Le\u00a0Roux, N. (2021). Beyond variance reduction: Understanding the true impact of baselines on policy optimization. In International conference on machine learning (pp. 1999\u20132009). PMLR."},{"key":"9695_CR34","unstructured":"Munos, R., Stepleton, T., Harutyunyan, A., & Bellemare, M. (2016). Safe and efficient off-policy reinforcement learning. In Advances in neural information processing systems (vol. 29)."},{"key":"9695_CR35","first-page":"13458","volume":"34","author":"JG Kuba","year":"2021","unstructured":"Kuba, J. G., Wen, M., Meng, L., Zhang, H., Mguni, D., Wang, J., Yang, Y., et al. (2021). Settling the variance of multi-agent policy gradients. Advances in Neural Information Processing Systems, 34, 13458\u201313470.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"9695_CR36","unstructured":"Wang, X., Tian, Z., Wan, Z., Wen, Y., Wang, J., & Zhang, W. (2023). Order matters: Agent-by-agent policy optimization. In The 11th international conference on learning representations. https:\/\/openreview.net\/forum?id=Q-neeWNVv1"},{"key":"9695_CR37","unstructured":"Sunehag, P., Lever, G., Gruslys, A., Czarnecki, W. M., Zambaldi, V., Jaderberg, M., Lanctot, M., Sonnerat, N., Leibo, J. Z., Tuyls, K., & Graepel, T. (2018). Value-decomposition networks for cooperative multi-agent learning based on team reward. In Proceedings of the 17th International conference on autonomous agents and multiagent systems. AAMAS \u201918, Richland, SC (pp. 2085\u20132087)."},{"key":"9695_CR38","unstructured":"Neu, G., Jonsson, A., & G\u00f3mez, V. (2017). A unified view of entropy-regularized markov decision processes. CoRR abs\/1705.07798."},{"issue":"1","key":"9695_CR39","doi-asserted-by":"publisher","first-page":"1059","DOI":"10.1007\/s10107-022-01816-5","volume":"198","author":"G Lan","year":"2023","unstructured":"Lan, G. (2023). Policy mirror descent for reinforcement learning: Linear convergence, new sampling complexity, and generalized problem classes. Mathematical Programming, 198(1), 1059\u20131106.","journal-title":"Mathematical Programming"},{"key":"9695_CR40","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611974997","volume-title":"First-order methods in optimization","author":"A Beck","year":"2017","unstructured":"Beck, A. (2017). First-order methods in optimization. SIAM-Society for Industrial and Applied Mathematics."},{"issue":"1","key":"9695_CR41","first-page":"4431","volume":"22","author":"A Agarwal","year":"2021","unstructured":"Agarwal, A., Kakade, S. M., Lee, J. D., & Mahajan, G. (2021). On the theory of policy gradient methods: Optimality, approximation, and distribution shift. The Journal of Machine Learning Research, 22(1), 4431\u20134506.","journal-title":"The Journal of Machine Learning Research"},{"key":"9695_CR42","unstructured":"Schulman, J., Chen, X., & Abbeel, P. (2017). Equivalence between policy gradients and soft q-learning. arXiv preprint arXiv:1704.06440"}],"container-title":["Autonomous Agents and Multi-Agent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10458-025-09695-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10458-025-09695-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10458-025-09695-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T03:53:27Z","timestamp":1751860407000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10458-025-09695-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,20]]},"references-count":42,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["9695"],"URL":"https:\/\/doi.org\/10.1007\/s10458-025-09695-8","relation":{},"ISSN":["1387-2532","1573-7454"],"issn-type":[{"type":"print","value":"1387-2532"},{"type":"electronic","value":"1573-7454"}],"subject":[],"published":{"date-parts":[[2025,2,20]]},"assertion":[{"value":"5 February 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 February 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"12"}}