{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T11:46:53Z","timestamp":1769514413067,"version":"3.49.0"},"reference-count":37,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100003696","name":"Electronics and Telecommunications Research Institute","doi-asserted-by":"publisher","award":["19YE1410"],"award-info":[{"award-number":["19YE1410"]}],"id":[{"id":"10.13039\/501100003696","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000181","name":"AFOSR","doi-asserted-by":"publisher","award":["FA2386-19-1-4020"],"award-info":[{"award-number":["FA2386-19-1-4020"]}],"id":[{"id":"10.13039\/100000181","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Korean Government through the Ministry of Science and ICT","award":["NRF-2019R1F1A1060752"],"award-info":[{"award-number":["NRF-2019R1F1A1060752"]}]},{"name":"Korean Government through the Ministry of Science and ICT","award":["NRF-2021R1A6A1A03039493"],"award-info":[{"award-number":["NRF-2021R1A6A1A03039493"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/access.2021.3113350","type":"journal-article","created":{"date-parts":[[2021,9,16]],"date-time":"2021-09-16T20:14:23Z","timestamp":1631823263000},"page":"129728-129741","source":"Crossref","is-referenced-by-count":16,"title":["QSOD: Hybrid Policy Gradient for Deep Multi-agent Reinforcement 
Learning"],"prefix":"10.1109","volume":"9","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2230-6927","authenticated-orcid":false,"given":"Hafiz Muhammad Raza Ur","family":"Rehman","sequence":"first","affiliation":[{"name":"Department of Information and Communication Engineering, Yeungnam University, Gyeongsan, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6929-3188","authenticated-orcid":false,"given":"Byung-Won","family":"On","sequence":"additional","affiliation":[{"name":"Department of Software Convergence Engineering, Kunsan National University, Gunsan, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Devarani Devi","family":"Ningombam","sequence":"additional","affiliation":[{"name":"Planning Division, Electronics and Telecommunications Research Institute, Daejeon, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sungwon","family":"Yi","sequence":"additional","affiliation":[{"name":"Planning Division, Electronics and Telecommunications Research Institute, Daejeon, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0854-768X","authenticated-orcid":false,"given":"Gyu Sang","family":"Choi","sequence":"additional","affiliation":[{"name":"Department of Information and Communication Engineering, Yeungnam University, Gyeongsan, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1111\/1467-9213.00311"},{"key":"ref32","first-page":"464","article-title":"Multi-agent reinforcement learning in sequential social dilemmas","author":"leibo","year":"2017","journal-title":"Proc 16th Conf Auto Agents MultiAgent Syst"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2941229"},{"key":"ref30","first-page":"455","article-title":"Reinforcement learning and function 
approximation","author":"irodova","year":"2005","journal-title":"Proc FLAIRS Conf"},{"key":"ref37","first-page":"5887","article-title":"Qtran: Learning to factorize with transformation for cooperative multi-agent reinforcement learning","author":"son","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref36","article-title":"QVMix and QVMix-max: Extending the deep quality-value family of algorithms to cooperative multi-agent reinforcement learning","author":"leroy","year":"2020","journal-title":"arXiv 2012 12062"},{"key":"ref35","article-title":"QPLEX: Duplex dueling multi-agent Q-learning","author":"wang","year":"2020","journal-title":"arXiv 2008 01062"},{"key":"ref34","first-page":"7611","article-title":"MAVEN: MultiAgent variational exploration","author":"mahajan","year":"2019","journal-title":"Proc 32nd Int Conf Neural Inf Process Syst"},{"key":"ref10","first-page":"9924","article-title":"Multi-agent common knowledge reinforcement learning","volume":"32","author":"witt","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref11","first-page":"1","article-title":"Value-decomposition networks for cooperative multi-agent learning based on team reward","author":"sunehag","year":"2017","journal-title":"Proc 17th Int Conf Auton Agents MultiAgent Syst"},{"key":"ref12","article-title":"HyperNetworks","author":"ha","year":"2017","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref13","first-page":"4292","article-title":"QMIX: Monotonic value function factorisation for deep multi-agent reinforcement learning","author":"rashid","year":"2018","journal-title":"Proc 35th Int Conf Mach Learn"},{"key":"ref14","article-title":"StarCraft II: A new challenge for reinforcement learning","author":"vinyals","year":"2017","journal-title":"arXiv 1708 
04782"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1155\/2016\/7950348"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TSMCC.2007.913919"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0172395"},{"key":"ref18","article-title":"Learning from delayed rewards","author":"watkins","year":"1989"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1038\/nature14236"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2016.2634548"},{"key":"ref4","first-page":"2","article-title":"Guided deep reinforcement learning for swarm systems","author":"huttenrauch","year":"2017","journal-title":"Proc AAMAS Auton Robots Multirobot Syst (ARMS) Workshop"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-014-0645-7"},{"key":"ref3","article-title":"Multiagent reinforcement learning for multirobot systems: A survey","author":"yang","year":"2004"},{"key":"ref6","article-title":"Multiagent bidirectionally-coordinated nets: Emergence of human-level coordination in learning to play StarCraft combat games","author":"peng","year":"2017","journal-title":"arXiv 1703 10069"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1016\/j.advengsoft.2013.12.007"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-307-3.50049-6"},{"key":"ref8","first-page":"1","article-title":"Learning to play guess who? 
and inventing a grounded language as a consequence","author":"jorge","year":"2016","journal-title":"Proc NIPS Workshop Deep Reinforcement Learn"},{"key":"ref7","first-page":"1146","article-title":"Stabilising experience replay for deep multi-agent reinforcement learning","author":"foerster","year":"2017","journal-title":"Proc the 34th Int Conf Mach Learn"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2005.04.039"},{"key":"ref9","first-page":"1","article-title":"Counterfactual multiagent policy gradients","author":"foerster","year":"2018","journal-title":"Proc 32nd AAAI Conf Artif Intell"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2012.2219061"},{"key":"ref20","first-page":"2681","article-title":"Deep decentralized multi-task multi-agent RL under partial observability","author":"omidshafiei","year":"2017","journal-title":"Proc 34th Int Conf Mach Learn"},{"key":"ref22","first-page":"1789","article-title":"Collaborative multiagent reinforcement learning by payoff propagation","volume":"7","author":"kok","year":"2006","journal-title":"J Mach Learn Res"},{"key":"ref21","first-page":"1523","article-title":"Multiagent planning with factored MDPs","author":"guestrin","year":"2002","journal-title":"Advances in neural information processing systems"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-71682-4_5"},{"key":"ref23","first-page":"2244","article-title":"Learning multiagent communication with backpropagation","author":"sukhbaatar","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref26","article-title":"Episodic exploration for deep deterministic policies: An application to StarCraft micromanagement tasks","author":"usunier","year":"0","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref25","first-page":"6382","article-title":"Multi-agent actor-critic for mixed cooperative-competitive environments","author":"lowe","year":"2017","journal-title":"Proc Adv Neural Inf 
Process Syst"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/9312710\/09540595.pdf?arnumber=9540595","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T17:42:11Z","timestamp":1745343731000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9540595\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/access.2021.3113350","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}