{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T20:31:44Z","timestamp":1757622704028,"version":"3.44.0"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030864859"},{"type":"electronic","value":"9783030864866"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-86486-6_11","type":"book-chapter","created":{"date-parts":[[2021,9,9]],"date-time":"2021-09-09T15:25:48Z","timestamp":1631201148000},"page":"174-189","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Model-Based Offline Policy Optimization with Distribution Correcting Regularization"],"prefix":"10.1007","author":[{"given":"Jian","family":"Shen","sequence":"first","affiliation":[]},{"given":"Mingcheng","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Zhicheng","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Zhengyu","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Weinan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Yu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,10]]},"reference":[{"unstructured":"Agarwal, R., Schuurmans, D., Norouzi, 
M.: Striving for simplicity in off-policy deep reinforcement learning (2019)","key":"11_CR1"},{"doi-asserted-by":"crossref","unstructured":"Chen, H., et al.: Large-scale interactive recommendation with tree-structured policy gradient. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 3312\u20133320 (2019)","key":"11_CR2","DOI":"10.1609\/aaai.v33i01.33013312"},{"doi-asserted-by":"crossref","unstructured":"Chen, M., Beutel, A., Covington, P., Jain, S., Belletti, F., Chi, E.H.: Top-k off-policy correction for a reinforce recommender system. In: Proceedings of the Twelfth ACM International Conference on Web Search and Data Mining, pp. 456\u2013464 (2019)","key":"11_CR3","DOI":"10.1145\/3289600.3290999"},{"unstructured":"Chua, K., Calandra, R., McAllister, R., Levine, S.: Deep reinforcement learning in a handful of trials using probabilistic dynamics models. In: Advances in Neural Information Processing Systems, pp. 4754\u20134765 (2018)","key":"11_CR4"},{"doi-asserted-by":"crossref","unstructured":"Covington, P., Adams, J., Sargin, E.: Deep neural networks for Youtube recommendations. In: Proceedings of the 10th ACM Conference on Recommender Systems, pp. 191\u2013198 (2016)","key":"11_CR5","DOI":"10.1145\/2959100.2959190"},{"unstructured":"Fu, J., Kumar, A., Nachum, O., Tucker, G., Levine, S.: D4rl: datasets for deep data-driven reinforcement learning. arXiv preprint arXiv:2004.07219 (2020)","key":"11_CR6"},{"unstructured":"Fujimoto, S., Meger, D., Precup, D.: Off-policy deep reinforcement learning without exploration. In: International Conference on Machine Learning, pp. 2052\u20132062. PMLR (2019)","key":"11_CR7"},{"unstructured":"Gottesman, O., et al.: Evaluating reinforcement learning algorithms in observational health settings. 
Computing Research Repository (CoRR) (2018)","key":"11_CR8"},{"issue":"Mar","key":"11_CR9","first-page":"723","volume":"13","author":"A Gretton","year":"2012","unstructured":"Gretton, A., Borgwardt, K.M., Rasch, M.J., Sch\u00f6lkopf, B., Smola, A.: A kernel two-sample test. J. Mach. Learn. Res. 13(Mar), 723\u2013773 (2012)","journal-title":"J. Mach. Learn. Res."},{"unstructured":"Haarnoja, T., Zhou, A., Abbeel, P., Levine, S.: Soft actor-critic: off-policy maximum entropy deep reinforcement learning with a stochastic actor. arXiv preprint arXiv:1801.01290 (2018)","key":"11_CR10"},{"doi-asserted-by":"crossref","unstructured":"Hessel, M., et al.: Rainbow: combining improvements in deep reinforcement learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","key":"11_CR11","DOI":"10.1609\/aaai.v32i1.11796"},{"unstructured":"Ho, J., Ermon, S.: Generative adversarial imitation learning. In: Advances in Neural Information Processing Systems, pp. 4565\u20134573 (2016)","key":"11_CR12"},{"unstructured":"Janner, M., Fu, J., Zhang, M., Levine, S.: When to trust your model: model-based policy optimization. In: Advances in Neural Information Processing Systems, pp. 12498\u201312509 (2019)","key":"11_CR13"},{"unstructured":"Jaques, N., et al.: Way off-policy batch deep reinforcement learning of implicit human preferences in dialog. arXiv preprint arXiv:1907.00456 (2019)","key":"11_CR14"},{"unstructured":"Karampatziakis, N., Kochman, S., Huang, J., Mineiro, P., Osborne, K., Chen, W.: Lessons from real-world reinforcement learning in a customer support bot. Computing Research Repository (CoRR) (2019)","key":"11_CR15"},{"unstructured":"Kidambi, R., Rajeswaran, A., Netrapalli, P., Joachims, T.: Morel: model-based offline reinforcement learning. 
arXiv preprint arXiv:2005.05951 (2020)","key":"11_CR16"},{"issue":"4","key":"11_CR17","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1007\/s13748-016-0094-0","volume":"5","author":"B Krawczyk","year":"2016","unstructured":"Krawczyk, B.: Learning from imbalanced data: open challenges and future directions. Progr. Artif. Intell. 5(4), 221\u2013232 (2016). https:\/\/doi.org\/10.1007\/s13748-016-0094-0","journal-title":"Progr. Artif. Intell."},{"unstructured":"Kumar, A., Fu, J., Tucker, G., Levine, S.: Stabilizing off-policy Q-learning via bootstrapping error reduction. arXiv preprint arXiv:1906.00949 (2019)","key":"11_CR18"},{"unstructured":"Kumar, A., Zhou, A., Tucker, G., Levine, S.: Conservative Q-learning for offline reinforcement learning. arXiv preprint arXiv:2006.04779 (2020)","key":"11_CR19"},{"key":"11_CR20","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1007\/978-3-642-27645-3_2","volume-title":"Reinforcement Learning","author":"S Lange","year":"2012","unstructured":"Lange, S., Gabel, T., Riedmiller, M.: Batch reinforcement learning. In: Wiering, M., van Otterlo, M. (eds.) Reinforcement Learning, vol. 12, pp. 45\u201373. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-27645-3_2"},{"unstructured":"Levine, S., Kumar, A., Tucker, G., Fu, J.: Offline reinforcement learning: tutorial, review, and perspectives on open problems. arXiv preprint arXiv:2005.01643 (2020)","key":"11_CR21"},{"unstructured":"Lillicrap, T.P., et al.: Continuous control with deep reinforcement learning. arXiv preprint arXiv:1509.02971 (2015)","key":"11_CR22"},{"unstructured":"Luo, Y., Xu, H., Li, Y., Tian, Y., Darrell, T., Ma, T.: Algorithmic framework for model-based deep reinforcement learning with theoretical guarantees. 
arXiv preprint arXiv:1807.03858 (2018)","key":"11_CR23"},{"issue":"2","key":"11_CR24","doi-asserted-by":"publisher","first-page":"429","DOI":"10.2307\/1428011","volume":"29","author":"A M\u00fcller","year":"1997","unstructured":"M\u00fcller, A.: Integral probability metrics and their generating classes of functions. Adv. Appl. Probab. 29(2), 429\u2013443 (1997)","journal-title":"Adv. Appl. Probab."},{"unstructured":"Nachum, O., Chow, Y., Dai, B., Li, L.: DualDICE: behavior-agnostic estimation of discounted stationary distribution corrections. arXiv preprint arXiv:1906.04733 (2019)","key":"11_CR25"},{"unstructured":"Nachum, O., Dai, B., Kostrikov, I., Chow, Y., Li, L., Schuurmans, D.: AlgaeDICE: policy gradient from arbitrary experience. arXiv preprint arXiv:1912.02074 (2019)","key":"11_CR26"},{"doi-asserted-by":"crossref","unstructured":"Nagabandi, A., Kahn, G., Fearing, R.S., Levine, S.: Neural network dynamics for model-based deep reinforcement learning with model-free fine-tuning. In: 2018 IEEE International Conference on Robotics and Automation (ICRA), pp. 7559\u20137566. IEEE (2018)","key":"11_CR27","DOI":"10.1109\/ICRA.2018.8463189"},{"unstructured":"Peng, X.B., Kumar, A., Zhang, G., Levine, S.: Advantage-weighted regression: simple and scalable off-policy reinforcement learning. arXiv preprint arXiv:1910.00177 (2019)","key":"11_CR28"},{"key":"11_CR29","doi-asserted-by":"publisher","DOI":"10.1515\/9781400873173","volume-title":"Convex Analysis","author":"RT Rockafellar","year":"1970","unstructured":"Rockafellar, R.T.: Convex Analysis, vol. 36. Princeton University Press, Princeton (1970)"},{"issue":"19","key":"11_CR30","doi-asserted-by":"publisher","first-page":"70","DOI":"10.2352\/ISSN.2470-1173.2017.19.AVM-023","volume":"2017","author":"AE Sallab","year":"2017","unstructured":"Sallab, A.E., Abdou, M., Perot, E., Yogamani, S.: Deep reinforcement learning framework for autonomous driving. Electron. 
Imaging 2017(19), 70\u201376 (2017)","journal-title":"Electron. Imaging"},{"unstructured":"Shen, J., Zhao, H., Zhang, W., Yu, Y.: Model-based policy optimization with unsupervised model adaptation. In: Advances in Neural Information Processing Systems, vol. 33 (2020)","key":"11_CR31"},{"unstructured":"Siegel, N.Y., et al.: Keep doing what worked: behavioral modelling priors for offline reinforcement learning. arXiv preprint arXiv:2002.08396 (2020)","key":"11_CR32"},{"issue":"7587","key":"11_CR33","doi-asserted-by":"publisher","first-page":"484","DOI":"10.1038\/nature16961","volume":"529","author":"D Silver","year":"2016","unstructured":"Silver, D., et al.: Mastering the game of go with deep neural networks and tree search. Nature 529(7587), 484\u2013489 (2016)","journal-title":"Nature"},{"unstructured":"Strehl, A., Langford, J., Kakade, S., Li, L.: Learning from logged implicit exploration data. Computing Research Repository (CoRR) (2010)","key":"11_CR34"},{"issue":"1","key":"11_CR35","first-page":"1731","volume":"16","author":"A Swaminathan","year":"2015","unstructured":"Swaminathan, A., Joachims, T.: Batch learning from logged bandit feedback through counterfactual risk minimization. J. Mach. Learn. Res. 16(1), 1731\u20131755 (2015)","journal-title":"J. Mach. Learn. Res."},{"doi-asserted-by":"crossref","unstructured":"Swazinna, P., Udluft, S., Runkler, T.: Overcoming model bias for robust offline deep reinforcement learning. arXiv preprint arXiv:2008.05533 (2020)","key":"11_CR36","DOI":"10.1016\/j.engappai.2021.104366"},{"key":"11_CR37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-71050-9","volume-title":"Optimal Transport: Old and New","author":"C Villani","year":"2008","unstructured":"Villani, C.: Optimal Transport: Old and New, vol. 338. Springer, Heidelberg (2008). 
https:\/\/doi.org\/10.1007\/978-3-540-71050-9"},{"doi-asserted-by":"crossref","unstructured":"Wang, L., Zhang, W., He, X., Zha, H.: Supervised reinforcement learning with recurrent neural network for dynamic treatment recommendation. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 2447\u20132456 (2018)","key":"11_CR38","DOI":"10.1145\/3219819.3219961"},{"unstructured":"Wang, Q., Xiong, J., Han, L., Sun, P., Liu, H., Zhang, T.: Exponentially weighted imitation learning for batched historical data. In: NeurIPS, pp. 6291\u20136300 (2018)","key":"11_CR39"},{"unstructured":"Wu, Y., Tucker, G., Nachum, O.: Behavior regularized offline reinforcement learning. arXiv preprint arXiv:1911.11361 (2019)","key":"11_CR40"},{"doi-asserted-by":"crossref","unstructured":"Yu, C., Ren, G., Liu, J.: Deep inverse reinforcement learning for sepsis treatment. In: 2019 IEEE International Conference on Healthcare Informatics (ICHI), pp. 1\u20133. IEEE (2019)","key":"11_CR41","DOI":"10.1109\/ICHI.2019.8904645"},{"unstructured":"Yu, T., et al.: MOPO: model-based offline policy optimization. arXiv preprint arXiv:2005.13239 (2020)","key":"11_CR42"},{"doi-asserted-by":"crossref","unstructured":"Zadrozny, B.: Learning and evaluating classifiers under sample selection bias. In: Proceedings of the Twenty-First International Conference on Machine Learning, p. 114 (2004)","key":"11_CR43","DOI":"10.1145\/1015330.1015425"},{"unstructured":"Zhang, R., Dai, B., Li, L., Schuurmans, D.: GenDICE: generalized offline estimation of stationary values. arXiv preprint arXiv:2002.09072 (2020)","key":"11_CR44"},{"unstructured":"Zhang, S., Liu, B., Whiteson, S.: GradientDICE: rethinking generalized offline estimation of stationary values. In: International Conference on Machine Learning, pp. 11194\u201311203. 
PMLR (2020)","key":"11_CR45"},{"unstructured":"Zhou, L., Small, K., Rokhlenko, O., Elkan, C.: End-to-end offline goal-oriented dialog policy learning via policy gradient. Computing Research Repository (CoRR) (2017)","key":"11_CR46"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases. Research Track"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-86486-6_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T22:05:55Z","timestamp":1757369155000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-86486-6_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030864859","9783030864866"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-86486-6_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"10 September 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bilbao","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Spain","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 September 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2021.ecmlpkdd.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"869","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"210","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers 
Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"24% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3-4","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3-9","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held online due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}