{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T13:52:37Z","timestamp":1774360357601,"version":"3.50.1"},"reference-count":51,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,3,9]],"date-time":"2022-03-09T00:00:00Z","timestamp":1646784000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,3,9]],"date-time":"2022-03-09T00:00:00Z","timestamp":1646784000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["CMMI-2038625"],"award-info":[{"award-number":["CMMI-2038625"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,3,9]]},"DOI":"10.1109\/ciss53076.2022.9751189","type":"proceedings-article","created":{"date-parts":[[2022,4,14]],"date-time":"2022-04-14T19:40:07Z","timestamp":1649965207000},"page":"107-112","source":"Crossref","is-referenced-by-count":2,"title":["The RBMLE method for Reinforcement Learning"],"prefix":"10.1109","author":[{"given":"Akshay","family":"Mete","sequence":"first","affiliation":[{"name":"Texas A &#x0026; M University,Department of ECE,College Station,TX,USA"}]},{"given":"Rahul","family":"Singh","sequence":"additional","affiliation":[{"name":"Indian Institute of Science,Department of ECE,Bengaluru,India"}]},{"given":"P. R.","family":"Kumar","sequence":"additional","affiliation":[{"name":"Texas A &#x0026; M University,Department of ECE,College Station,TX,USA"}]}],"member":"263","reference":[{"key":"ref39","first-page":"1583","article-title":"Learning to optimize via information-directed sampling","volume":"27","author":"russo","year":"2014","journal-title":"ADVANCES IN NEURAL IN-FORMATION PROCESSING SYSTEMS"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ALLERTON.2010.5706896"},{"key":"ref33","first-page":"1246","article-title":"Thompson sampling for linear-quadratic control problems","author":"abeille","year":"2017","journal-title":"Artificial Intelli- gence and Statistics"},{"key":"ref32","first-page":"861","article-title":"Thompson sampling for learning parameterized markov decision processes","author":"gopalan","year":"0","journal-title":"Conference on Learning Theory"},{"key":"ref31","first-page":"592","article-title":"On bayesian upper confidence bounds for bandit problems","author":"kaufmann","year":"2012","journal-title":"Artificial Intelligence and Statistics"},{"key":"ref30","first-page":"39","article-title":"Analysis of thompson sam-pling for the multi-armed bandit problem","author":"agrawal","year":"0","journal-title":"Conference on Learning Theory JMLR Workshop and Conference Proceedings"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2021.3063626"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.2018.2874671"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1002\/9780470980033"},{"key":"ref34","first-page":"6248","article-title":"Exploration through reward biasing: Reward-biased maximum likelihood estimation for stochastic multi-armed bandits","author":"liu","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/25.3-4.285"},{"key":"ref27","article-title":"Learning in Markov decision processes under constraints","author":"singh","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1090\/S0002-9904-1952-09620-8"},{"key":"ref2","author":"astrom","year":"2013","journal-title":"Adaptive Control"},{"key":"ref1","volume":"73","author":"tsypkin","year":"1971","journal-title":"Adaptation and Learning in Automatic Systems"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1023\/A:1013689704352"},{"key":"ref22","first-page":"49","article-title":"Logarithmic online regret bounds for undiscounted reinforcement learning","author":"auer","year":"2007","journal-title":"Advances in neural information processing systems"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/1772690.1772758"},{"key":"ref24","article-title":"Gaussian process optimization in the bandit setting: No regret and experimental design","author":"srinivas","year":"2009","journal-title":"ArXiv Preprint"},{"key":"ref23","article-title":"Near-optimal regret bounds for reinforcement learning","volume":"11","author":"jaksch","year":"2010","journal-title":"Journal of Machine Learning Research"},{"key":"ref26","first-page":"620","article-title":"Constrained upper confidence reinforcement learning","author":"zheng","year":"2020","journal-title":"Learning for Dynamics and Control"},{"key":"ref25","first-page":"1","article-title":"Regret bounds for the adaptive control of linear quadratic systems","author":"abbasi-yadkori","year":"0","journal-title":"Proceedings of the 24th Annual Conference on Learning Theory JMLR Workshop and Conference Proceedings"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.4310\/CIS.2006.v6.n4.a3"},{"key":"ref51","article-title":"Augmented RBMLE-UCB Approach for Adaptive Control of Linear Quadratic Systems","author":"mete","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1017\/9781108571401"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.1979.1102191"},{"key":"ref40","article-title":"Reward-biased maximum likelihood estimation for lin-ear stochastic bandits","author":"hung","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.1982.1102878"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.1982.1103017"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.1983.1103122"},{"key":"ref15","first-page":"81","article-title":"Optimal strategies for the n-armed bandit problem","author":"becker","year":"1981","journal-title":"Univ Maryland Baltimore County Math Res Rep"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/BF00939540"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/BF00939938"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/BF02193097"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/0196-8858(85)90002-8"},{"key":"ref4","author":"kumar","year":"1986","journal-title":"Stochastic Systems Estimation Identification and Adaptive Control"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1137\/0323023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-08-042375-3.50010-X"},{"key":"ref5","author":"goodwin","year":"2014","journal-title":"Adaptive Filtering Prediction and Control"},{"key":"ref8","volume":"42","author":"hernandez-lerma","year":"2012","journal-title":"Further Topics on Discrete-Time Markov Control Processes"},{"key":"ref7","author":"sutton","year":"2018","journal-title":"Reinforcement Learning An Introduction"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1137\/S0363012999366369"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.2307\/1426206"},{"key":"ref46","article-title":"Learning unknown Markov Decision Processes: A Thompson Sampling approach","author":"ouyang","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref45","first-page":"815","article-title":"Reward biased maximum likelihood estimation for reinforce-ment learning","author":"mete","year":"2021","journal-title":"Learning for Dynamics and Control"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1137\/S0363012997317499"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1137\/0321009"},{"key":"ref42","first-page":"208","article-title":"Contextual bandits with linear payoff functions","author":"chu","year":"0","journal-title":"Proceedings of the fourteenth international conference on artificial intelligence and statistics JMLR Workshop and Conference Proceedings"},{"key":"ref41","article-title":"Gaussian process optimization in the bandit setting: No regret and experimental design","author":"srinivas","year":"2009","journal-title":"ArXiv Preprint"},{"key":"ref44","article-title":"(more) efficient reinforcement learning via posterior sampling","author":"osband","year":"2013","journal-title":"Advances in neural information processing systems"},{"key":"ref43","first-page":"89","article-title":"Near-optimal regret bounds for reinforcement learning","author":"auer","year":"2009","journal-title":"Advances in Neu-ral Information Processing Systems"}],"event":{"name":"2022 56th Annual Conference on Information Sciences and Systems (CISS)","location":"Princeton, NJ, USA","start":{"date-parts":[[2022,3,9]]},"end":{"date-parts":[[2022,3,11]]}},"container-title":["2022 56th Annual Conference on Information Sciences and Systems (CISS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9751119\/9751149\/09751189.pdf?arnumber=9751189","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,15]],"date-time":"2022-06-15T20:15:25Z","timestamp":1655324125000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9751189\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,9]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/ciss53076.2022.9751189","relation":{},"subject":[],"published":{"date-parts":[[2022,3,9]]}}}