{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T14:20:27Z","timestamp":1760710827476,"version":"3.37.3"},"reference-count":51,"publisher":"Springer Science and Business Media LLC","license":[{"start":{"date-parts":[[2022,1,5]],"date-time":"2022-01-05T00:00:00Z","timestamp":1641340800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,5]],"date-time":"2022-01-05T00:00:00Z","timestamp":1641340800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/100000006","name":"office of naval researc","doi-asserted-by":"publisher","award":["ONR MURI Grant N00014-16-1-2710"],"award-info":[{"award-number":["ONR MURI Grant N00014-16-1-2710"]}],"id":[{"id":"10.13039\/100000006","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006831","name":"u.s. air force","doi-asserted-by":"publisher","award":["AFOSR Grant FA9550-19-1-0353"],"award-info":[{"award-number":["AFOSR Grant FA9550-19-1-0353"]}],"id":[{"id":"10.13039\/100006831","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Dyn Games Appl"],"DOI":"10.1007\/s13235-021-00420-0","type":"journal-article","created":{"date-parts":[[2022,1,5]],"date-time":"2022-01-05T14:03:51Z","timestamp":1641391431000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["Provably Efficient Reinforcement Learning in Decentralized General-Sum Markov Games"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8301-4173","authenticated-orcid":false,"given":"Weichao","family":"Mao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tamer","family":"Ba\u015far","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,1,5]]},"reference":[{"issue":"4","key":"420_CR1","doi-asserted-by":"publisher","first-page":"1545","DOI":"10.1109\/TAC.2016.2598476","volume":"62","author":"G Arslan","year":"2016","unstructured":"Arslan G, Y\u00fcksel S (2016) Decentralized Q-learning for stochastic teams and games. IEEE Trans Autom Control 62(4):1545\u20131558","journal-title":"IEEE Trans Autom Control"},{"key":"420_CR2","doi-asserted-by":"crossref","unstructured":"Auer P, Cesa-Bianchi N, Freund Y, Schapire RE (1995) Gambling in a rigged casino: the adversarial multi-armed bandit problem. In: Foundations of computer science. IEEE, pp 322\u2013331","DOI":"10.1109\/SFCS.1995.492488"},{"key":"420_CR3","first-page":"1","volume":"55","author":"RJ Aumann","year":"1987","unstructured":"Aumann RJ (1987) Correlated equilibrium as an expression of Bayesian rationality. Econom J Econom Soc 55:1\u201318","journal-title":"Econom J Econom Soc"},{"key":"420_CR4","unstructured":"Bai Y, Jin C (2020) Provable self-play algorithms for competitive reinforcement learning. In: International conference on machine learning, pp 551\u2013560"},{"key":"420_CR5","unstructured":"Bai Y, Jin C, Yu T (2020) Near-optimal reinforcement learning with self-play. In: Advances in neural information processing systems, vol 33"},{"issue":"4","key":"420_CR6","doi-asserted-by":"publisher","first-page":"819","DOI":"10.1287\/moor.27.4.819.297","volume":"27","author":"DS Bernstein","year":"2002","unstructured":"Bernstein DS, Givan R, Immerman N, Zilberstein S (2002) The complexity of decentralized control of Markov decision processes. Math Oper Res 27(4):819\u2013840","journal-title":"Math Oper Res"},{"issue":"6374","key":"420_CR7","doi-asserted-by":"publisher","first-page":"418","DOI":"10.1126\/science.aao1733","volume":"359","author":"N Brown","year":"2018","unstructured":"Brown N, Sandholm T (2018) Superhuman AI for heads-up no-limit poker: Libratus beats top professionals. Science 359(6374):418\u2013424","journal-title":"Science"},{"key":"420_CR8","doi-asserted-by":"crossref","unstructured":"Bubeck S et al (2015) Convex optimization: algorithms and complexity. Found Trends\u00ae Mach Learn 8(3\u20134):231\u2013357","DOI":"10.1561\/2200000050"},{"key":"420_CR9","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511546921","volume-title":"Prediction, learning, and games","author":"N Cesa-Bianchi","year":"2006","unstructured":"Cesa-Bianchi N, Lugosi G (2006) Prediction, learning, and games. Cambridge University Press, Cambridge"},{"issue":"3","key":"420_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1516512.1516516","volume":"56","author":"X Chen","year":"2009","unstructured":"Chen X, Deng X, Teng SH (2009) Settling the complexity of computing two-player Nash equilibria. J ACM 56(3):1\u201357","journal-title":"J ACM"},{"key":"420_CR11","unstructured":"Claus C (1998) Boutilier C (1998) The dynamics of reinforcement learning in cooperative multiagent systems. In: AAAI conference on artificial intelligence, vol 746\u2013752, p 2"},{"issue":"1","key":"420_CR12","doi-asserted-by":"publisher","first-page":"195","DOI":"10.1137\/070699652","volume":"39","author":"C Daskalakis","year":"2009","unstructured":"Daskalakis C, Goldberg PW, Papadimitriou CH (2009) The complexity of computing a Nash equilibrium. SIAM J Comput 39(1):195\u2013259","journal-title":"SIAM J Comput"},{"key":"420_CR13","unstructured":"Daskalakis C, Foster DJ, Golowich N (2020) Independent policy gradient methods for competitive reinforcement learning. In: Advances in neural information processing systems, vol 33"},{"key":"420_CR14","unstructured":"Fang H, Harvey N, Portella V, Friedlander M (2020) Online mirror descent and dual averaging: keeping pace in the dynamic case. In: International conference on machine learning, pp 3008\u20133017"},{"key":"420_CR15","volume-title":"Competitive Markov decision processes","author":"J Filar","year":"2012","unstructured":"Filar J, Vrieze K (2012) Competitive Markov decision processes. Springer, Berlin"},{"key":"420_CR16","unstructured":"Greenwald A, Hall K (2003) Correlated-Q learning. In: International conference on machine learning, pp 242\u2013249"},{"issue":"5","key":"420_CR17","doi-asserted-by":"publisher","first-page":"1127","DOI":"10.1111\/1468-0262.00153","volume":"68","author":"S Hart","year":"2000","unstructured":"Hart S, Mas-Colell A (2000) A simple adaptive procedure leading to correlated equilibrium. Econometrica 68(5):1127\u20131150","journal-title":"Econometrica"},{"issue":"6","key":"420_CR18","doi-asserted-by":"publisher","first-page":"644","DOI":"10.1109\/PROC.1980.11718","volume":"68","author":"YC Ho","year":"1980","unstructured":"Ho YC (1980) Team decision theory and information structures. Proc IEEE 68(6):644\u2013654","journal-title":"Proc IEEE"},{"key":"420_CR19","first-page":"1039","volume":"4","author":"J Hu","year":"2003","unstructured":"Hu J, Wellman MP (2003) Nash Q-learning for general-sum stochastic games. J Mach Learn Res 4:1039\u20131069","journal-title":"J Mach Learn Res"},{"issue":"4","key":"420_CR20","first-page":"1563","volume":"11","author":"T Jaksch","year":"2010","unstructured":"Jaksch T, Ortner R, Auer P (2010) Near-optimal regret bounds for reinforcement learning. J Mach Learn Res 11(4):1563\u20131600","journal-title":"J Mach Learn Res"},{"key":"420_CR21","unstructured":"Jin C, Allen-Zhu Z, Bubeck S, Jordan MI (2018) Is Q-learning provably efficient? In: International conference on neural information processing systems, pp 4868\u20134878"},{"issue":"11","key":"420_CR22","doi-asserted-by":"publisher","first-page":"1238","DOI":"10.1177\/0278364913495721","volume":"32","author":"J Kober","year":"2013","unstructured":"Kober J, Bagnell JA, Peters J (2013) Reinforcement learning in robotics: a survey. Int J Robot Res 32(11):1238\u20131274","journal-title":"Int J Robot Res"},{"issue":"2","key":"420_CR23","doi-asserted-by":"publisher","first-page":"495","DOI":"10.1137\/S0363012903437976","volume":"44","author":"DS Leslie","year":"2005","unstructured":"Leslie DS, Collins EJ (2005) Individual Q-learning in normal form games. SIAM J Control Optim 44(2):495\u2013514","journal-title":"SIAM J Control Optim"},{"key":"420_CR24","doi-asserted-by":"crossref","unstructured":"Littman ML (1994) Markov games as a framework for multi-agent reinforcement learning. In: Machine learning, pp 157\u2013163","DOI":"10.1016\/B978-1-55860-335-6.50027-1"},{"key":"420_CR25","unstructured":"Littman ML (2001) Friend-or-Foe Q-learning in general-sum games. In: International conference on machine learning, pp 322\u2013328"},{"key":"420_CR26","unstructured":"Littman ML, Szepesv\u00e1ri C (1996) A generalized reinforcement-learning model: convergence and applications. In: International conference on machine learning, pp 310\u2013318"},{"key":"420_CR27","unstructured":"Liu Q, Yu T, Bai Y, Jin C (2021) A sharp analysis of model-based reinforcement learning with self-play. In: International conference on machine learning. PMLR, pp 7001\u20137010"},{"issue":"3\u20134","key":"420_CR28","doi-asserted-by":"publisher","first-page":"201","DOI":"10.1007\/BF01769190","volume":"7","author":"H Moulin","year":"1978","unstructured":"Moulin H, Vial JP (1978) Strategically zero-sum games: the class of games whose completely mixed equilibria cannot be improved upon. Int J Game Theory 7(3\u20134):201\u2013221","journal-title":"Int J Game Theory"},{"issue":"3","key":"420_CR29","doi-asserted-by":"publisher","first-page":"555","DOI":"10.1109\/TAC.2013.2283743","volume":"59","author":"A Nayyar","year":"2013","unstructured":"Nayyar A, Gupta A, Langbort C, Ba\u015far T (2013) Common information based Markov perfect equilibria for stochastic games with asymmetric information: finite games. IEEE Trans Autom Control 59(3):555\u2013570","journal-title":"IEEE Trans Autom Control"},{"issue":"7","key":"420_CR30","doi-asserted-by":"publisher","first-page":"1644","DOI":"10.1109\/TAC.2013.2239000","volume":"58","author":"A Nayyar","year":"2013","unstructured":"Nayyar A, Mahajan A, Teneketzis D (2013) Decentralized stochastic control with partial history sharing: a common information approach. IEEE Trans Autom Control 58(7):1644\u20131658","journal-title":"IEEE Trans Autom Control"},{"key":"420_CR31","volume-title":"Problem complexity and method efficiency in optimization","author":"AS Nemirovskij","year":"1983","unstructured":"Nemirovskij AS, Yudin DB (1983) Problem complexity and method efficiency in optimization. Wiley-Interscience, New York"},{"key":"420_CR32","first-page":"3168","volume":"28","author":"G Neu","year":"2015","unstructured":"Neu G (2015) Explore no more: improved high-probability regret bounds for non-stochastic bandits. Adv Neural Inf Process Syst 28:3168\u20133176","journal-title":"Adv Neural Inf Process Syst"},{"key":"420_CR33","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1016\/j.tcs.2017.11.021","volume":"716","author":"F Orabona","year":"2018","unstructured":"Orabona F, P\u00e1l D (2018) Scale-free online learning. Theor Comput Sci 716:50\u201369","journal-title":"Theor Comput Sci"},{"issue":"3","key":"420_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1379759.1379762","volume":"55","author":"CH Papadimitriou","year":"2008","unstructured":"Papadimitriou CH, Roughgarden T (2008) Computing correlated equilibria in multi-player games. J ACM 55(3):1\u201329","journal-title":"J ACM"},{"key":"420_CR35","unstructured":"P\u00e9rolat J, Strub F, Piot B, Pietquin O (2017) Learning Nash equilibrium for general-sum Markov games from batch data. In: Artificial intelligence and statistics. PMLR, pp 232\u2013241"},{"key":"420_CR36","unstructured":"Prasad H, LA P, Bhatnagar S (2015) Two-timescale algorithms for learning Nash equilibria in general-sum stochastic games. In: International conference on autonomous agents and multiagent systems, pp 1371\u20131379"},{"key":"420_CR37","unstructured":"Shalev-Shwartz S, Shammah S, Shashua A (2016) Safe, multi-agent, reinforcement learning for autonomous driving. arXiv preprint arXiv:1610.03295"},{"issue":"10","key":"420_CR38","doi-asserted-by":"publisher","first-page":"1095","DOI":"10.1073\/pnas.39.10.1953","volume":"39","author":"LS Shapley","year":"1953","unstructured":"Shapley LS (1953) Stochastic games. Proc Natl Acad Sci 39(10):1095\u20131100","journal-title":"Proc Natl Acad Sci"},{"key":"420_CR39","unstructured":"Sidford A, Wang M, Yang L, Ye Y (2020) Solving discounted stochastic two-player games with near-optimal time and sample complexity. In: International conference on artificial intelligence and statistics. PMLR, pp 2992\u20133002"},{"issue":"7587","key":"420_CR40","doi-asserted-by":"publisher","first-page":"484","DOI":"10.1038\/nature16961","volume":"529","author":"D Silver","year":"2016","unstructured":"Silver D, Huang A, Maddison CJ, Guez A, Sifre L, Van Den Driessche G, Schrittwieser J, Antonoglou I, Panneershelvam V, Lanctot M et al (2016) Mastering the game of Go with deep neural networks and tree search. Nature 529(7587):484\u2013489","journal-title":"Nature"},{"key":"420_CR41","unstructured":"Tian Y, Wang Y, Yu T, Sra S (2021) Online learning in unknown Markov games. In: International conference on machine learning"},{"issue":"7782","key":"420_CR42","doi-asserted-by":"publisher","first-page":"350","DOI":"10.1038\/s41586-019-1724-z","volume":"575","author":"O Vinyals","year":"2019","unstructured":"Vinyals O, Babuschkin I, Czarnecki WM, Mathieu M, Dudzik A, Chung J, Choi DH, Powell R, Ewalds T, Georgiev P et al (2019) Grandmaster level in StarCraft II using multi-agent reinforcement learning. Nature 575(7782):350\u2013354","journal-title":"Nature"},{"key":"420_CR43","first-page":"1603","volume":"15","author":"X Wang","year":"2002","unstructured":"Wang X, Sandholm T (2002) Reinforcement learning to play an optimal Nash equilibrium in team Markov games. Adv Neural Inf Process Syst 15:1603\u20131610","journal-title":"Adv Neural Inf Process Syst"},{"key":"420_CR44","unstructured":"Wei CY, Hong YT, Lu CJ (2017) Online reinforcement learning in stochastic games. In: International conference on neural information processing systems, pp 4994\u20135004"},{"key":"420_CR45","unstructured":"Wei CY, Lee CW, Zhang M, Luo H (2021) Last-iterate convergence of decentralized optimistic gradient descent\/ascent in infinite-horizon competitive Markov games. In: Annual conference on learning theory"},{"key":"420_CR46","unstructured":"Xie Q, Chen Y, Wang Z, Yang Z (2020) Learning zero-sum simultaneous-move Markov games using function approximation and correlated equilibrium. In: Conference on learning theory, pp 3674\u20133682"},{"key":"420_CR47","unstructured":"Yongacoglu B, Arslan G, Y\u00fcksel S (2019) Learning team-optimality for decentralized stochastic control and dynamic games. arXiv preprint arXiv:1903.05812"},{"key":"420_CR48","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4614-7085-4","volume-title":"Stochastic networked control systems: stabilization and optimization under information constraints","author":"S Y\u00fcksel","year":"2013","unstructured":"Y\u00fcksel S, Ba\u015far T (2013) Stochastic networked control systems: stabilization and optimization under information constraints. Springer, Berlin"},{"key":"420_CR49","doi-asserted-by":"crossref","unstructured":"Zehfroosh A, Tanner HG (2020) PAC reinforcement learning algorithm for general-sum Markov games. arXiv preprint arXiv:2009.02605","DOI":"10.1109\/MED48518.2020.9182985"},{"key":"420_CR50","unstructured":"Zhao Y, Tian Y, Lee JD, Du SS (2021) Provably efficient policy gradient methods for two-player zero-sum Markov games. arXiv preprint arXiv:2102.08903"},{"key":"420_CR51","unstructured":"Zinkevich M (2003) Online convex programming and generalized infinitesimal gradient ascent. In: International conference on machine learning, pp 928\u2013936"}],"container-title":["Dynamic Games and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13235-021-00420-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13235-021-00420-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13235-021-00420-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,5]],"date-time":"2022-01-05T14:05:23Z","timestamp":1641391523000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13235-021-00420-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,5]]},"references-count":51,"alternative-id":["420"],"URL":"https:\/\/doi.org\/10.1007\/s13235-021-00420-0","relation":{},"ISSN":["2153-0785","2153-0793"],"issn-type":[{"type":"print","value":"2153-0785"},{"type":"electronic","value":"2153-0793"}],"subject":[],"published":{"date-parts":[[2022,1,5]]},"assertion":[{"value":"13 December 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 January 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}