{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T16:02:20Z","timestamp":1776182540708,"version":"3.50.1"},"reference-count":132,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100000038","name":"Discovery Grant from the Natural Sciences and Engineering Research Council of Canada","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Commun. Surv. 
Tutorials"],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/comst.2021.3063822","type":"journal-article","created":{"date-parts":[[2021,3,8]],"date-time":"2021-03-08T21:29:09Z","timestamp":1615238949000},"page":"1226-1252","source":"Crossref","is-referenced-by-count":346,"title":["Single and Multi-Agent Deep Reinforcement Learning for AI-Enabled Wireless Networks: A Tutorial"],"prefix":"10.1109","volume":"23","author":[{"given":"Amal","family":"Feriani","sequence":"first","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Manitoba, Winnipeg, MB, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5932-6887","authenticated-orcid":false,"given":"Ekram","family":"Hossain","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Manitoba, Winnipeg, MB, Canada"}]}],"member":"263","reference":[{"key":"ref39","article-title":"A machine learning approach to routing","author":"valadarsky","year":"2017"},{"key":"ref38","article-title":"Proximal policy optimization algorithms","author":"schulman","year":"2017"},{"key":"ref33","first-page":"1057","article-title":"Policy gradient methods for reinforcement learning with function approximation","author":"sutton","year":"2000","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TCOMM.2020.3004524"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICC.2019.8762084"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.2968951"},{"key":"ref37","first-page":"1889","article-title":"Trust region policy optimization","author":"schulman","year":"2015","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref36","first-page":"1928","article-title":"Asynchronous methods for deep reinforcement learning","author":"mnih","year":"2016","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref35","article-title":"High-dimensional continuous control using 
generalized advantage estimation","author":"schulman","year":"2015"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992696"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-28929-8"},{"key":"ref27","article-title":"Cooperative and distributed reinforcement learning of drones for field coverage","author":"pham","year":"2018"},{"key":"ref29","first-page":"709","article-title":"Dynamic programming for partially observable stochastic games","volume":"4","author":"hansen","year":"2004","journal-title":"Proc AAAI"},{"key":"ref20","article-title":"A deep Q-learning method for downlink power allocation in multi-cell networks","author":"ahmed","year":"2019"},{"key":"ref22","first-page":"1555","article-title":"Predictive representations of state","author":"littman","year":"2002","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/LWC.2019.2948992"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3004223"},{"key":"ref23","article-title":"Predictive state representations: A new theory for modeling dynamical systems","author":"singh","year":"2012"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737652"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-335-6.50027-1"},{"key":"ref100","article-title":"Decentralized multi-agent reinforcement learning with networked agents: Recent advances","author":"zhang","year":"2019"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.39.10.1953"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2019.2921159"},{"key":"ref51","article-title":"Deep reinforcement learning and the deadly triad","author":"van hasselt","year":"2018"},{"key":"ref59","first-page":"5729","article-title":"Hessian aided policy gradient","author":"shen","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref58","article-title":"Stochastic 
variance-reduced policy gradient","author":"papini","year":"2018"},{"key":"ref57","article-title":"Global convergence of policy gradient methods to (almost) locally optimal policies","author":"zhang","year":"2019"},{"key":"ref56","first-page":"1","article-title":"Finite-time analysis for double Q-learning","volume":"33","author":"xiong","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref55","first-page":"10555","article-title":"A finite-time analysis of Q-learning with neural network function approximation","author":"xu","year":"2020","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref54","first-page":"1125","article-title":"SBEED: Convergent reinforcement learning with nonlinear function approximation","author":"dai","year":"2018","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref53","first-page":"1458","article-title":"Learning from conditional distributions via dual embeddings","author":"dai","year":"2017","journal-title":"Proc Artif Intell Stat"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-377-6.50013-X"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/GLOBECOM38437.2019.9013115"},{"key":"ref4","first-page":"265","article-title":"TensorFlow: A system for large-scale machine learning","author":"abadi","year":"2016","journal-title":"Proc of USENIX Symp on Operating Systems Design and Implementation (OSDI)"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3019590"},{"key":"ref6","first-page":"195","article-title":"Planning, learning and coordination in multiagent decision processes","author":"boutilier","year":"1996","journal-title":"Proc 6th Conf Theor Aspects Rational Knowl"},{"key":"ref5","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Adv Neural Inf Process 
Syst"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/1516512.1516516"},{"key":"ref49","article-title":"Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor","author":"haarnoja","year":"2018"},{"key":"ref7","author":"shoham","year":"2003","journal-title":"Multi-agent reinforcement learning A critical survey"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2019.2916583"},{"key":"ref46","first-page":"387","article-title":"Deterministic policy gradient algorithms","author":"silver","year":"2014","journal-title":"Proc 31st Int Conf Int Conf Mach Learn"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TCCN.2018.2809722"},{"key":"ref48","article-title":"Addressing function approximation error in actor-critic methods","author":"fujimoto","year":"2018"},{"key":"ref47","article-title":"Continuous control with deep reinforcement learning","author":"lillicrap","year":"2015"},{"key":"ref42","article-title":"Playing atari with deep reinforcement learning","author":"mnih","year":"2013"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992698"},{"key":"ref44","first-page":"1995","article-title":"Dueling network architectures for deep reinforcement learning","author":"wang","year":"2016","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref43","article-title":"Prioritized experience replay","author":"schaul","year":"2015"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/TWC.2020.3024860"},{"key":"ref126","article-title":"Double-IRS assisted multi-user MIMO: Cooperative passive beamforming design","author":"zheng","year":"2020"},{"key":"ref125","article-title":"Secure beamforming for distributed intelligent reflecting surfaces aided mmWave systems","author":"xiu","year":"2020"},{"key":"ref124","article-title":"Intelligent reflecting surface assisted anti-jamming communications: A fast reinforcement learning 
approach","author":"yang","year":"2020"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/BF00993104"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-141-3.50030-4"},{"key":"ref129","first-page":"11327","article-title":"Privacy-preserving Q-learning with functional noise in continuous spaces","author":"wang","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref71","first-page":"465","article-title":"PILCO: A model-based and data-efficient approach to policy search","author":"deisenroth","year":"2011","journal-title":"Proc 28th Int Conf Mach Learn (ICML)"},{"key":"ref128","article-title":"Exploiting multiple intelligent reflecting surfaces in multi-cell uplink MIMO communications","author":"kim","year":"2020"},{"key":"ref70","first-page":"5842","article-title":"Composable planning with attributes","author":"zhang","year":"2018","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref76","first-page":"12519","article-title":"When to trust your model: Model-based policy optimization","author":"janner","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/525"},{"key":"ref77","first-page":"34","article-title":"Improving PILCO with bayesian neural network dynamics models","volume":"4","author":"gal","year":"2016","journal-title":"Proc Data Efficient Mach Learn Workshop"},{"key":"ref74","article-title":"Benchmarking model-based reinforcement learning","author":"langlois","year":"2019"},{"key":"ref75","article-title":"Tutorial on model-based methods in reinforcement learning","author":"mordatch","year":"2020","journal-title":"Proc ICML"},{"key":"ref131","article-title":"Adversarial policies: Attacking deep reinforcement learning","author":"gleave","year":"2019"},{"key":"ref78","first-page":"4754","article-title":"Deep reinforcement learning in a handful of trials using probabilistic dynamics 
models","author":"chua","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref132","article-title":"Robust adversarial reinforcement learning","author":"pinto","year":"2017"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1007\/s10846-017-0468-y"},{"key":"ref60","first-page":"541","article-title":"An improved convergence analysis of stochastic variance-reduced policy gradient","author":"xu","year":"2020","journal-title":"Proc Conf Uncertainty of Artificial Intelligence"},{"key":"ref62","article-title":"Neural policy gradient methods: Global optimality and rates of convergence","author":"wang","year":"2019"},{"key":"ref61","article-title":"Neural proximal\/trust region policy optimization attains globally optimal policy","author":"liu","year":"2019"},{"key":"ref63","article-title":"Improving sample complexity bounds for actor-critic algorithms","author":"xu","year":"2020"},{"key":"ref64","article-title":"A finite time analysis of two time-scale actor critic methods","author":"wu","year":"2020"},{"key":"ref65","article-title":"Non-asymptotic convergence analysis of two time-scale (natural) actor-critic algorithms","author":"xu","year":"2020"},{"key":"ref66","first-page":"3127","article-title":"Convergent policy optimization for safe reinforcement learning","author":"yu","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-57321-8_5"},{"key":"ref68","article-title":"Explainability in deep reinforcement learning","author":"heuillet","year":"2020"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3010896"},{"key":"ref69","first-page":"14322","article-title":"When to use parametric models in reinforcement learning?","author":"van hasselt","year":"2019","journal-title":"Proc Adv Neural Inf Process 
Syst"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/MNET.001.1900287"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3010329"},{"key":"ref95","article-title":"QMIX: Monotonic value function factorisation for deep multi-agent reinforcement learning","author":"rashid","year":"2018"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/IEEECONF44664.2019.9049050"},{"key":"ref94","first-page":"2085","article-title":"Value-decomposition networks for cooperative multi-agent learning based on team reward","author":"sunehag","year":"2018","journal-title":"Proc AAMAS"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.23919\/JCC.2020.09.017"},{"key":"ref93","first-page":"6379","article-title":"Multi-agent actor-critic for mixed cooperative-competitive environments","author":"lowe","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref106","article-title":"Decentralized computation offloading for multi-user mobile edge computing: A deep reinforcement learning approach","author":"chen","year":"2018"},{"key":"ref92","article-title":"On the pitfalls of measuring emergent communication","author":"lowe","year":"2019"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.3390\/app10144735"},{"key":"ref91","article-title":"Learning efficient multi-agent communication: An information bottleneck approach","author":"wang","year":"2019"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2020.3008362"},{"key":"ref90","article-title":"Learning multi-agent communication under limited-bandwidth restriction for Internet packet 
routing","author":"mao","year":"2019"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1016\/j.comnet.2020.107496"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2019.2943405"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/CCNC.2019.8651796"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2943253"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/78"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-71682-4_5"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CDC.2018.8619581"},{"key":"ref96","article-title":"The representational capacity of action-value networks for multi-agent reinforcement learning","author":"castellini","year":"2019"},{"key":"ref97","first-page":"7613","article-title":"MAVEN: Multi-agent variational exploration","author":"mahajan","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"30","DOI":"10.23919\/JCIN.2019.8917870","article-title":"Survey on reinforcement learning applications in communication networks","volume":"4","author":"qian","year":"2019","journal-title":"J Commun Inf Netw"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2020.2988367"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC47483.2019.9023331"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/MNET.2018.1800109"},{"key":"ref14","article-title":"6G white paper on machine learning in wireless communication networks","author":"ali","year":"2020"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2019.2954595"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/TVT.2019.2897134"},{"key":"ref16","article-title":"A tutorial of ultra-reliable and low-latency communications in 6G: Integrating theoretical knowledge into deep 
learning","author":"she","year":"2020"},{"key":"ref82","first-page":"2172","article-title":"Infogan: Interpretable representation learning by information maximizing generative adversarial nets","author":"chen","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/TVT.2019.2961405"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/IWCMC.2019.8766739"},{"key":"ref81","first-page":"776","article-title":"Multi-agent reinforcement learning with multi-step generative models","author":"krupnik","year":"2020","journal-title":"Proc Conf Robot Learn"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2020.2976000"},{"key":"ref84","article-title":"How you act tells a lot: Privacy-leakage attack on deep reinforcement learning","author":"pan","year":"2019"},{"key":"ref119","article-title":"Reinforcement learning for mitigating intermittent interference in Terahertz communication networks","author":"barazideh","year":"2020"},{"key":"ref19","author":"sutton","year":"2018","journal-title":"Reinforcement Learning An Introduction"},{"key":"ref83","article-title":"A survey of learning in multiagent environments: Dealing with non-stationarity","author":"hernandez-leal","year":"2017"},{"key":"ref114","first-page":"1169","article-title":"Application of deep reinforcement learning to UAV fleet control","author":"to\u017ei\u010dka","year":"2018","journal-title":"Proc SAI Intell Syst Conf"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/ICCW.2019.8756984"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2890210"},{"key":"ref80","article-title":"Model-based multi-agent reinforcement learning with cooperative prioritized 
sweeping","author":"bargiacchi","year":"2020"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2020.3018825"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1145\/3411295.3411317"},{"key":"ref89","first-page":"7254","article-title":"Learning attentional communication for multi-agent cooperation","author":"jiang","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/TSUSC.2020.3025139"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/TWC.2020.3003719"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/VTC2020-Spring48590.2020.9129476"},{"key":"ref85","article-title":"Multi-agent reinforcement learning: A selective overview of theories and algorithms","author":"zhang","year":"2019"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-307-3.50049-6"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1007\/s10458-019-09421-1"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.2977374"}],"container-title":["IEEE Communications Surveys &amp; Tutorials"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9739\/9438976\/09372298.pdf?arnumber=9372298","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T17:46:26Z","timestamp":1683308786000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9372298\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":132,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/comst.2021.3063822","relation":{},"ISSN":["1553-877X","2373-745X"],"issn-type":[{"value":"1553-877X","type":"electronic"},{"value":"2373-745X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}