{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T04:31:49Z","timestamp":1780633909319,"version":"3.54.1"},"reference-count":95,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Open Philanthropy Foundation"},{"DOI":"10.13039\/501100000275","name":"Leverhulme Trust","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000275","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["IIS-1901252"],"award-info":[{"award-number":["IIS-1901252"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["CCF-1909499"],"award-info":[{"award-number":["CCF-1909499"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["DMS-2023505"],"award-info":[{"award-number":["DMS-2023505"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Inform. Theory"],"published-print":{"date-parts":[[2022,12]]},"DOI":"10.1109\/tit.2022.3185139","type":"journal-article","created":{"date-parts":[[2022,6,22]],"date-time":"2022-06-22T19:51:22Z","timestamp":1655927482000},"page":"8156-8196","source":"Crossref","is-referenced-by-count":30,"title":["Bridging Offline Reinforcement Learning and Imitation Learning: A Tale of Pessimism"],"prefix":"10.1109","volume":"68","author":[{"given":"Paria","family":"Rashidinejad","sequence":"first","affiliation":[{"name":"Department of Electrical Engineering and Computer Sciences, University of California at Berkeley (UC Berkeley), Berkeley, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7320-3533","authenticated-orcid":false,"given":"Banghua","family":"Zhu","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering and Computer Sciences, University of California at Berkeley (UC Berkeley), Berkeley, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2532-0038","authenticated-orcid":false,"given":"Cong","family":"Ma","sequence":"additional","affiliation":[{"name":"Department of Statistics, The University of Chicago, Chicago, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3766-8031","authenticated-orcid":false,"given":"Jiantao","family":"Jiao","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering and Computer Sciences and the Department of Statistics, University of California at Berkeley (UC Berkeley), Berkeley, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Stuart","family":"Russell","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering and Computer Sciences, University of California at Berkeley (UC Berkeley), Berkeley, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref73","first-page":"1","article-title":"Critic regularized regression","volume":"33","author":"wang","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref72","article-title":"Way off-policy batch deep reinforcement learning of implicit human preferences in dialog","author":"jaques","year":"2019","journal-title":"arXiv 1907 00456"},{"key":"ref71","article-title":"Behavior regularized offline reinforcement learning","author":"wu","year":"2019","journal-title":"arXiv 1911 11361"},{"key":"ref70","article-title":"RL unplugged: A suite of benchmarks for offline reinforcement learning","author":"gulcehre","year":"2020","journal-title":"arXiv 2006 13888"},{"key":"ref76","first-page":"1","article-title":"GenDICE: Generalized offline estimation of stationary values","author":"zhang","year":"2020","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref77","first-page":"2315","article-title":"DualDICE: Behavior-agnostic estimation of discounted stationary distribution corrections","author":"nachum","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref39","first-page":"21810","article-title":"MOReL: Model-based offline reinforcement learning","volume":"33","author":"kidambi","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref74","article-title":"Benchmarking batch deep reinforcement learning algorithms","author":"fujimoto","year":"2019","journal-title":"arXiv 1910 01708"},{"key":"ref38","first-page":"7677","article-title":"Near-optimal offline reinforcement learning via double variance reduction","volume":"34","author":"yin","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref75","article-title":"AlgaeDICE: Policy gradient from arbitrary experience","author":"nachum","year":"2019","journal-title":"arXiv 1912 02074"},{"key":"ref78","first-page":"11194","article-title":"GradientDICE: Rethinking generalized offline estimation of stationary values","author":"zhang","year":"2020","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref79","first-page":"2818","article-title":"Sample complexity of episodic fixed-horizon reinforcement learning","volume":"2","author":"dann","year":"2015","journal-title":"Proc 28th Int Conf Neural Inf Process Syst"},{"key":"ref33","first-page":"1314","article-title":"Approximate policy iteration schemes: A comparison","author":"scherrer","year":"2014","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref32","article-title":"Finite sample analysis of minimax offline reinforcement learning: Completeness, fast rates and first-order efficiency","author":"uehara","year":"2021","journal-title":"arXiv 2102 02981"},{"key":"ref31","first-page":"4063","article-title":"Sparse feature selection makes batch reinforcement learning more sample efficient","author":"hao","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref30","first-page":"1567","article-title":"Near-optimal provable uniform convergence in offline policy evaluation for reinforcement learning","author":"yin","year":"2021","journal-title":"Proc Int Conf Artif Intell Statist"},{"key":"ref37","first-page":"14129","article-title":"MOPO: Model-based offline policy optimization","volume":"33","author":"yu","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref36","first-page":"5084","article-title":"Is pessimism provably efficient for offline RL?","author":"jin","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref35","first-page":"267","article-title":"Approximately optimal approximate reinforcement learning","volume":"2","author":"kakade","year":"2002","journal-title":"Proc ICML"},{"key":"ref34","first-page":"3205","article-title":"Is the Bellman residual a bad proxy?","author":"geist","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref60","first-page":"1","article-title":"Neural trust region\/proximal policy optimization attains globally optimal policy","author":"liu","year":"2019","journal-title":"Proc Neural Inf Process Syst"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/1102351.1102462"},{"key":"ref61","first-page":"4572","article-title":"Variational policy gradient method for reinforcement learning with general utilities","volume":"33","author":"zhang","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-007-5038-2"},{"key":"ref28","first-page":"1","article-title":"Near-optimal time and sample complexities for solving discounted Markov decision process with a generative model","author":"fsidford","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref64","first-page":"1","article-title":"Fitted Q-iteration in continuous action-space MDPs","author":"antos","year":"2007","journal-title":"Proc Neural Inf Process Syst"},{"key":"ref27","first-page":"11404","article-title":"Batch value-function approximation with only realizability","author":"xie","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref65","first-page":"560","article-title":"Error bounds for approximate policy iteration","author":"munos","year":"2003","journal-title":"Proc 20th Int Conf Mach Learn"},{"key":"ref66","article-title":"A kernel loss for solving the Bellman equation","volume":"32","author":"feng","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref29","first-page":"67","article-title":"Model-based reinforcement learning with a generative model is minimax optimal","author":"agarwal","year":"2020","journal-title":"Proc Conf Learn Theory"},{"key":"ref67","first-page":"9659","article-title":"Minimax weight and Q-function learning for off-policy evaluation","author":"uehara","year":"2020","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref68","first-page":"770","article-title":"Variance reduced value iteration and faster algorithms for solving markov decision processes","author":"sidford","year":"2018","journal-title":"Proceedings of the 5th Annual ACM-SIAM Symposium on Discrete Algorithms"},{"key":"ref69","first-page":"12861","article-title":"Breaking the sample size barrier in model-based reinforcement learning with a generative model","volume":"33","author":"li","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1038\/nature24270"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1038\/nature16961"},{"key":"ref20","first-page":"2052","article-title":"Off-policy deep reinforcement learning without exploration","author":"fujimoto","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref22","first-page":"64","article-title":"Optimality and approximation with policy gradient methods in Markov decision processes","author":"agarwal","year":"2020","journal-title":"Proc Conf Learn Theory"},{"key":"ref21","first-page":"1","article-title":"Stabilizing off-policy Q-learning via bootstrapping error reduction","volume":"32","author":"kumar","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1137\/040614384"},{"key":"ref23","first-page":"1","article-title":"Is a good representation sufficient for sample efficient reinforcement learning?","author":"du","year":"2020","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref26","first-page":"1042","article-title":"Information-theoretic considerations in batch reinforcement learning","author":"chen","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref25","first-page":"1","article-title":"Error propagation for approximate policy and value iteration","author":"farahmand","year":"2010","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref50","first-page":"1","article-title":"The importance of pessimism in fixed-dataset policy optimization","author":"buckman","year":"2020","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref51","author":"kumar","year":"2020","journal-title":"Offline Reinforcement Learning From Algorithms to Practical Challenges"},{"key":"ref95","first-page":"739","article-title":"Estimate of the number of signals in error correcting codes","volume":"117","author":"varshamov","year":"1957","journal-title":"Doklady Akademii Nauk SSSR"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1002\/j.1538-7305.1952.tb01393.x"},{"key":"ref93","author":"mitzenmacher","year":"2017","journal-title":"Probability and Computing Randomization and Probabilistic Techniques in Algorithms and Data Analysis"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2018.2846245"},{"key":"ref91","author":"robert","year":"2012","journal-title":"Information Theory"},{"key":"ref90","author":"le cam","year":"2012","journal-title":"Asymptotic Methods in Statistical Decision Theory"},{"key":"ref59","article-title":"Batch policy learning in average reward Markov decision processes","author":"liao","year":"2020","journal-title":"arXiv 2007 11771"},{"key":"ref58","first-page":"1","article-title":"Neural policy gradient methods: Global optimality and rates of convergence","author":"wang","year":"2019","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref57","article-title":"On value functions and the agent-environment boundary","author":"jiang","year":"2019","journal-title":"arXiv 1905 13341"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.2200\/S00268ED1V01Y201005AIM009"},{"key":"ref55","first-page":"12287","article-title":"Exponential lower bounds for batch reinforcement learning: Batch RL can be exponentially harder than online RL","author":"zanette","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref54","first-page":"1","article-title":"What are the statistical limits of offline RL with linear function approximation?","author":"wang","year":"2020","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref53","doi-asserted-by":"crossref","first-page":"331","DOI":"10.1016\/S0927-0507(05)80172-0","article-title":"Markov decision processes","volume":"2","author":"puterman","year":"1990","journal-title":"Handbooks in Operations Research and Management Science"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1016\/j.tcs.2010.12.059"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1080\/01621459.2020.1831925"},{"key":"ref40","first-page":"28954","article-title":"COMBO: Conservative offline model-based policy optimization","volume":"34","author":"yu","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2983149"},{"key":"ref12","article-title":"End to end learning for self-driving cars","author":"bojarski","year":"2016","journal-title":"arXiv 1604 07316 [cs]"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2018.XIV.056"},{"key":"ref14","first-page":"1","article-title":"Learning from logged implicit exploration data","volume":"23","author":"strehl","year":"2010","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/2645710.2645745"},{"key":"ref82","first-page":"4868","article-title":"Is Q-learning provably efficient?","author":"jin","year":"2018","journal-title":"Proc 32nd Int Conf Neural Inf Process Syst"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i2.19104"},{"key":"ref81","first-page":"1704","article-title":"Contextual decision processes with low Bellman rank are PAC-learnable","author":"jiang","year":"2017","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref17","first-page":"661","article-title":"Efficient reductions for imitation learning","author":"ross","year":"2010","journal-title":"Proc 13th Int Conf Artif Intell Statist"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2022.3162335"},{"key":"ref18","first-page":"2914","article-title":"Toward the fundamental limits of imitation learning","volume":"33","author":"rajaraman","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-013-5368-1"},{"key":"ref19","first-page":"5637","article-title":"WILDS: A benchmark of in-the-wild distribution shifts","author":"koh","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref80","first-page":"1848","article-title":"PAC reinforcement learning with rich observations","author":"krishnamurthy","year":"2016","journal-title":"Proc 30th Int Conf Neural Inf Process Syst"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-1880-7_29"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1038\/nature14236"},{"key":"ref3","article-title":"Playing Atari with deep reinforcement learning","author":"mnih","year":"2013","journal-title":"arXiv 1312 5602"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-27645-3_2"},{"key":"ref5","article-title":"D4RL: Datasets for deep data-driven reinforcement learning","author":"fu","year":"2020","journal-title":"arXiv 2004 07219"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-34106-9_26"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219961"},{"key":"ref49","first-page":"3682","article-title":"EMaQ: Expected-max Q-learning operator for simple yet effective offline and online RL","author":"ghasemipour","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref86","first-page":"578","article-title":"Episodic reinforcement learning in finite MDPs: Minimax lower bounds revisited","author":"domingues","year":"2021","journal-title":"Algorithmic Learning Theory"},{"key":"ref7","article-title":"Offline reinforcement learning: Tutorial, review, and perspectives on open problems","author":"levine","year":"2020","journal-title":"arXiv 2005 01643"},{"key":"ref87","first-page":"2701","article-title":"Minimax-optimal off-policy evaluation with linear function approximation","author":"duan","year":"2020","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref88","first-page":"1","article-title":"Error bounds of imitating policies and environments","volume":"33","author":"xu","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1038\/s41591-018-0310-5"},{"key":"ref46","first-page":"3652","article-title":"Safe policy improvement with baseline bootstrapping","author":"laroche","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref45","first-page":"53","article-title":"Safe policy improvement with soft baseline bootstrapping","author":"nadjahi","year":"2019","journal-title":"Proc Eur Conf Mach Learn Knowl Discovery Databases"},{"key":"ref48","first-page":"1","article-title":"Keep doing what worked: Behavior modelling priors for offline reinforcement learning","author":"siegel","year":"2019","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref47","article-title":"Advantage-weighted regression: Simple and scalable off-policy reinforcement learning","author":"peng","year":"2019","journal-title":"arXiv 1910 00177"},{"key":"ref42","first-page":"1179","article-title":"Conservative Q-learning for offline reinforcement learning","volume":"33","author":"kumar","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref41","first-page":"1264","article-title":"Provably good batch off-policy reinforcement learning without great exploration","volume":"33","author":"liu","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref44","article-title":"Reinforcement learning via Fenchel-Rockafellar duality","author":"nachum","year":"2020","journal-title":"arXiv 2001 01866"},{"key":"ref43","first-page":"104","article-title":"An optimistic perspective on offline reinforcement learning","author":"agarwal","year":"2020","journal-title":"Proc Int Conf Mach Learn"}],"container-title":["IEEE Transactions on Information Theory"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/18\/9961124\/9803237-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/18\/9961124\/09803237.pdf?arnumber=9803237","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,12]],"date-time":"2022-12-12T19:12:45Z","timestamp":1670872365000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9803237\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,12]]},"references-count":95,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tit.2022.3185139","relation":{},"ISSN":["0018-9448","1557-9654"],"issn-type":[{"value":"0018-9448","type":"print"},{"value":"1557-9654","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,12]]}}}