{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,26]],"date-time":"2025-08-26T06:49:04Z","timestamp":1756190944648},"reference-count":30,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,11,17]],"date-time":"2023-11-17T00:00:00Z","timestamp":1700179200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,17]],"date-time":"2023-11-17T00:00:00Z","timestamp":1700179200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Stat Comput"],"published-print":{"date-parts":[[2024,2]]},"DOI":"10.1007\/s11222-023-10351-y","type":"journal-article","created":{"date-parts":[[2023,11,17]],"date-time":"2023-11-17T09:02:44Z","timestamp":1700211764000},"update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Off-policy evaluation for tabular reinforcement learning with synthetic trajectories"],"prefix":"10.1007","volume":"34","author":[{"given":"Weiwei","family":"Wang","sequence":"first","affiliation":[]},{"given":"Yuqiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xianyi","family":"Wu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,11,17]]},"reference":[{"key":"10351_CR1","unstructured":"Afsar, M. M., Crump, T., Far, B.: Reinforcement learning based recommender systems: a survey (2021). arXiv:2101.06286"},{"key":"10351_CR2","doi-asserted-by":"crossref","unstructured":"Bassen, J., Balaji, B., Schaarschmidt, M., Thille, C., Painter, J., Zimmaro, D., Mitchell, J. C.: Reinforcement learning for the adaptive scheduling of educational activities. In: Proceedings of the 2020 CHI Conference on Human Factors in Computing Systems, pp. 1\u201312 (2020)","DOI":"10.1145\/3313831.3376518"},{"issue":"4","key":"10351_CR3","first-page":"1","volume":"5","author":"O Chapelle","year":"2014","unstructured":"Chapelle, O., Manavoglu, E., Rosales, R.: Simple and scalable response prediction for display advertising. ACM Trans. Intell. Syst. Technol. (TIST) 5(4), 1\u201334 (2014)","journal-title":"ACM Trans. Intell. Syst. Technol. (TIST)"},{"issue":"6","key":"10351_CR4","doi-asserted-by":"publisher","first-page":"2092","DOI":"10.1016\/j.eswa.2012.10.014","volume":"40","author":"FA Dorca","year":"2013","unstructured":"Dorca, F.A., Lima, L.V., Fernandes, M.A., Lopes, C.R.: Comparing strategies for modeling students learning styles through reinforcement learning in adaptive and intelligent educational systems: An experimental analysis. Expert Syst. Appl. 40(6), 2092\u20132101 (2013)","journal-title":"Expert Syst. Appl."},{"key":"10351_CR5","unstructured":"Dudik, M., Langford, J., Li, L.: Doubly robust policy evaluation and learning (2011). arXiv:1103.4601"},{"key":"10351_CR6","unstructured":"Farajtabar, M., Chow, Y., Ghavamzadeh, M.: More robust doubly robust off-policy evaluation. In: International Conference on Machine Learning, pp. 1447\u20131456 (2018)"},{"key":"10351_CR7","unstructured":"Fonteneau R., Murphy, S., Wehenkel, L., Ernst, D.: Model-free Monte Carlo-like policy evaluation. In: Proceedings of the 13th International Conference on Artificial Intelligence and Statistics, pp. 217\u2013224 (2010)"},{"key":"10351_CR8","doi-asserted-by":"publisher","first-page":"106706","DOI":"10.1016\/j.knosys.2020.106706","volume":"213","author":"L Huang","year":"2021","unstructured":"Huang, L., Fu, M., Li, F., Qu, H., Liu, Y., Chen, W.: A deep reinforcement learning based long-term recommender system. Knowl. Based Syst. 213, 106706 (2021)","journal-title":"Knowl. Based Syst."},{"key":"10351_CR9","unstructured":"Jiang, N., Li, L.: Doubly robust off-policy value evaluation for reinforcement learning. In: Proceedings of the 33rd International Conference on Machine Learning, pp. 652\u2013661 (2016)"},{"key":"10351_CR10","unstructured":"Levine, S., Kumar, A., Tucker, G., Fu, J.: Offline reinforcement learning: Tutorial, review, and perspectives on open problems (2020). arXiv:2005.01643"},{"key":"10351_CR11","doi-asserted-by":"crossref","unstructured":"Li, L., Chu, W., Langford, J., Wang, X.: Unbiased offline evaluation of contextual-bandit-based news article recommendation algorithms. In: Proceedings of the fourth ACM International Conference on Web Search and Data Mining, pp. 297\u2013306 (2011)","DOI":"10.1145\/1935826.1935878"},{"key":"10351_CR12","unstructured":"Liu, Q., Li, L., Tang, Z., Zhou, D.: Breaking the curse of horizon: infinite-horizon off-policy estimation (2018). arXiv:1810.12429"},{"issue":"530","key":"10351_CR13","doi-asserted-by":"publisher","first-page":"692","DOI":"10.1080\/01621459.2018.1537919","volume":"115","author":"DJ Luckett","year":"2020","unstructured":"Luckett, D.J., Laber, E.B., Kahkoska, A.R., Maahs, D.M., Mayer-Davis, E., Kosorok, M.R.: Estimating dynamic treatment regimes in mobile health using v-learning. J. Am. Stat. Assoc. 115(530), 692\u2013706 (2020)","journal-title":"J. Am. Stat. Assoc."},{"key":"10351_CR14","unstructured":"Mandel, T., Liu, Y. E., Levine, S., Brunskill, E., Popovic, Z.: Offline policy evaluation across representations with applications to educational games. In: AAMAS, pp. 1077\u20131084 (2014)"},{"issue":"2","key":"10351_CR15","doi-asserted-by":"publisher","first-page":"308","DOI":"10.1287\/mnsc.1060.0614","volume":"53","author":"S Mannor","year":"2007","unstructured":"Mannor, S., Simester, D., Sun, P., Tsitsiklis, J.N.: Bias and variance approximation in value function estimates. Manag. Sci. 53(2), 308\u2013322 (2007)","journal-title":"Manag. Sci."},{"issue":"2","key":"10351_CR16","doi-asserted-by":"publisher","first-page":"331","DOI":"10.1111\/1467-9868.00389","volume":"65","author":"SA Murphy","year":"2003","unstructured":"Murphy, S.A.: Optimal dynamic treatment regimes. J. R. Stat. Soc. Ser. B (Stat. Methodol.) 65(2), 331\u2013355 (2003)","journal-title":"J. R. Stat. Soc. Ser. B (Stat. Methodol.)"},{"issue":"456","key":"10351_CR17","doi-asserted-by":"publisher","first-page":"1410","DOI":"10.1198\/016214501753382327","volume":"96","author":"SA Murphy","year":"2001","unstructured":"Murphy, S.A., van der Laan, M.J., Robins, J.M.: Conduct problems prevention research group. Marginal mean models for dynamic regimes. J. Am. Stat. Assoc. 96(456), 1410\u20131423 (2001)","journal-title":"J. Am. Stat. Assoc."},{"key":"10351_CR18","unstructured":"Precup, D., Sutton, R., Singh, S.: Eligibility traces for off-policy policy evaluation. In: Proceedings of the 17th International Conference on Machine Learning, pp. 759\u2013766 (2000)"},{"key":"10351_CR19","unstructured":"Sridharan, K.: A gentle introduction to concentration inequalities. Department of Computer Science - Cornell University, Tech. Rep(2002)"},{"key":"10351_CR20","volume-title":"Reinforcement Learning: An Introduction","author":"RS Sutton","year":"2018","unstructured":"Sutton, R.S., Barto, A.G.: Reinforcement Learning: An Introduction. MIT Press, Cambridge (2018)"},{"key":"10351_CR21","doi-asserted-by":"crossref","unstructured":"Theocharous, G., Thomas, P. S., Ghavamzadeh, M.: Ad recommendation systems for lifetime value optimization. In: Proceedings of the 24th International Conference on World Wide Web, pp. 1305\u20131310 (2015)","DOI":"10.1145\/2740908.2741998"},{"key":"10351_CR22","unstructured":"Thomas, P., Brunskill, E.: Data-efficient off-policy policy evaluation for reinforcement learning. In: Proceedings of the 33rd International Conference on Machine Learning, pp. 2139\u20132148 (2016)"},{"key":"10351_CR23","doi-asserted-by":"crossref","unstructured":"Thomas, P.S., Theocharous, G., Ghavamzadeh, M., Durugkar, I., Brunskill, E.: Predictive off-policy policy evaluation for nonstationary decision problems, with applications to digital marketing. In: AAAI, pp. 4740\u20134745 (2017)","DOI":"10.1609\/aaai.v31i2.19104"},{"key":"10351_CR24","unstructured":"Thomas, P.S.: Safe reinforcement learning. University of Massachusetts Libraries (2015)"},{"key":"10351_CR25","unstructured":"Xie, T., Ma, Y., Wang, Y. X.: Towards optimal off-policy evaluation for reinforcement learning with marginalized importance sampling (2019). arXiv:1906.03393"},{"key":"10351_CR26","unstructured":"Yin, M., Wang, Y. X.: Asymptotically efficient off-policy evaluation for tabular reinforcement learning. In: International Conference on Artificial Intelligence and Statistics, pp. 3948\u20133958 (2020)"},{"key":"10351_CR27","first-page":"14129","volume":"33","author":"T Yu","year":"2020","unstructured":"Yu, T., Thomas, G., Yu, L., Ermon, S., Zou, J.Y., Levine, S., Ma, T.: Mopo: model-based offline policy optimization. Adv. Neural Inf. Process. Syst. 33, 14129\u201314142 (2020)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10351_CR28","doi-asserted-by":"crossref","unstructured":"Zhan, X., Zhu, X., Xu, H.: Model-based offline planning with trajectory pruning (2021). arXiv:2105.07351","DOI":"10.24963\/ijcai.2022\/516"},{"issue":"26","key":"10351_CR29","doi-asserted-by":"publisher","first-page":"3294","DOI":"10.1002\/sim.3720","volume":"28","author":"Y Zhao","year":"2009","unstructured":"Zhao, Y., Kosorok, M.R., Zeng, D.: Reinforcement learning design for cancer clinical trials. Stat. Med. 28(26), 3294\u20133315 (2009)","journal-title":"Stat. Med."},{"key":"10351_CR30","doi-asserted-by":"crossref","unstructured":"Zou, L., Xia, L., Ding, Z., Song, J., Liu, W., Yin, D. Reinforcement learning to optimize long-term user engagement in recommender systems. In: Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 2810\u20132818 (2019)","DOI":"10.1145\/3292500.3330668"}],"container-title":["Statistics and Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11222-023-10351-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11222-023-10351-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11222-023-10351-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,20]],"date-time":"2024-01-20T10:20:14Z","timestamp":1705746014000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11222-023-10351-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,17]]},"references-count":30,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,2]]}},"alternative-id":["10351"],"URL":"https:\/\/doi.org\/10.1007\/s11222-023-10351-y","relation":{},"ISSN":["0960-3174","1573-1375"],"issn-type":[{"value":"0960-3174","type":"print"},{"value":"1573-1375","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,11,17]]},"assertion":[{"value":"11 December 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 October 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 November 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"41"}}