{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T15:28:19Z","timestamp":1774538899838,"version":"3.50.1"},"reference-count":51,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Networking"],"published-print":{"date-parts":[[2021,4]]},"DOI":"10.1109\/tnet.2021.3051663","type":"journal-article","created":{"date-parts":[[2021,1,26]],"date-time":"2021-01-26T20:46:01Z","timestamp":1611693961000},"page":"750-763","source":"Crossref","is-referenced-by-count":17,"title":["Learning to Schedule Network Resources Throughput and Delay Optimally Using Q<sup>+<\/sup>-Learning"],"prefix":"10.1109","volume":"29","author":[{"given":"Jeongmin","family":"Bae","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Joohyun","family":"Lee","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Song","family":"Chong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177704593"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1287\/moor.1120.0555"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/25.350282"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1561\/2200000024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/0196-8858(85)90002-8"},{"key":"ref30","first-page":"4863","article-title":"Is Q-learning provably efficient?","author":"jin","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref37","first-page":"2258","article-title":"Efficient average reward reinforcement learning using constant shifting values","author":"yang","year":"2016","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"ref36","first-page":"700","article-title":"Reinforcement learning algorithms for average-payoff Markovian decision processes","volume":"94","author":"singh","year":"1994","journal-title":"Proc AAAI"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1287\/opre.9.3.383"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1561\/1300000001"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.tcs.2014.09.029"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/1102351.1102459"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143955"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/9.182479"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.23919\/WiOPT47501.2019.9144097"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICC.2017.7997286"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TVT.2017.2751641"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2018.8485853"},{"key":"ref24","article-title":"On the sample complexity of reinforcement learning","author":"kakade","year":"2003"},{"key":"ref23","article-title":"Experienced deep reinforcement learning with generative adversarial networks (GANs) for model-free ultra reliable low latency communication","author":"kasgari","year":"2019","journal-title":"arXiv 1911 03264"},{"key":"ref26","first-page":"213","article-title":"R-max-a general polynomial time algorithm for near-optimal reinforcement learning","volume":"3","author":"brafman","year":"2002","journal-title":"J Mach Learn Res"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1023\/A:1017984413808"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2007.897944"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.1982.1102980"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/INFCOM.2003.1208724"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2007.900405"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1287\/moor.18.1.163"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2006.876219"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2015.2478718"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2016.2611964"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TVT.2017.2762423"},{"key":"ref16","article-title":"Max-weight achieves the exact $[O(1\/V), O(V)]$\n utility-delay tradeoff under Markov dynamics","author":"huang","year":"2010","journal-title":"arXiv 1008 0200"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/MWC.2016.1500356WC"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TCOMM.2018.2850303"},{"key":"ref19","article-title":"Cellular-connected UAVs over 5G: Deep reinforcement learning for interference management","author":"challita","year":"2018","journal-title":"arXiv 1801 05500"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.2200\/S00271ED1V01Y201006CNT007"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/18.212277"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2012.2191157"},{"key":"ref5","article-title":"Q-learning with UCB exploration is sample efficient for infinite-horizon MDP","author":"dong","year":"2019","journal-title":"arXiv 1901 09311"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/26.780463"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CDC.1990.204000"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2016.2538238"},{"key":"ref9","article-title":"Dynamic power allocation and routing for satellite and wireless networks with time varying channels","author":"neely","year":"2003"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1994.6.6.1185"},{"key":"ref45","article-title":"Playing atari with deep reinforcement learning","author":"mnih","year":"2013","journal-title":"arXiv 1312 5602"},{"key":"ref48","author":"bae","year":"2020","journal-title":"Learning to Schedule Network Resources Throughput and Delay Optimally Using q+-Learning"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992698"},{"key":"ref42","volume":"30","author":"hern\u00e1ndez-lerma","year":"2012","journal-title":"Discrete-Time Markov Control Processes Basic Optimality Criteria"},{"key":"ref41","volume":"504","author":"sennott","year":"2009","journal-title":"Stochastic Dynamic Programming and the Control of Queueing Systems"},{"key":"ref44","doi-asserted-by":"crossref","first-page":"529","DOI":"10.1038\/nature14236","article-title":"Human-level control through deep reinforcement learning","volume":"518","author":"mnih","year":"2015","journal-title":"Nature"},{"key":"ref43","volume":"7","author":"altman","year":"1999","journal-title":"Constrained Markov Decision Processes"}],"container-title":["IEEE\/ACM Transactions on Networking"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/90\/9405508\/09336288.pdf?arnumber=9336288","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,9,13]],"date-time":"2021-09-13T20:36:13Z","timestamp":1631565373000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9336288\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,4]]},"references-count":51,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tnet.2021.3051663","relation":{},"ISSN":["1063-6692","1558-2566"],"issn-type":[{"value":"1063-6692","type":"print"},{"value":"1558-2566","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,4]]}}}