{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T09:00:07Z","timestamp":1743152407848,"version":"3.40.3"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031434204"},{"type":"electronic","value":"9783031434211"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-43421-1_27","type":"book-chapter","created":{"date-parts":[[2023,9,17]],"date-time":"2023-09-17T20:37:24Z","timestamp":1694983044000},"page":"455-471","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Offline Reinforcement Learning with\u00a0On-Policy Q-Function Regularization"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4038-8620","authenticated-orcid":false,"given":"Laixi","family":"Shi","sequence":"first","affiliation":[]},{"given":"Robert","family":"Dadashi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6766-5459","authenticated-orcid":false,"given":"Yuejie","family":"Chi","sequence":"additional","affiliation":[]},{"given":"Pablo Samuel","family":"Castro","sequence":"additional","affiliation":[]},{"given":"Matthieu","family":"Geist","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,9,18]]},"reference":[{"key":"27_CR1","doi-asserted-by":"crossref","unstructured":"Arulkumaran, K., Deisenroth, M.P., Brundage, M., Bharath, A.A.: A brief survey of deep reinforcement learning. arXiv preprint arXiv:1708.05866 (2017)","DOI":"10.1109\/MSP.2017.2743240"},{"key":"27_CR2","unstructured":"Bradbury, J., et al.: Jax: Autograd and xla. Astrophysics Source Code Library pp. ascl-2111 (2021)"},{"key":"27_CR3","first-page":"4933","volume":"34","author":"D Brandfonbrener","year":"2021","unstructured":"Brandfonbrener, D., Whitney, W., Ranganath, R., Bruna, J.: Offline RL without off-policy evaluation. Adv. Neural Inf. Process. Syst. 34, 4933\u20134946 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR4","unstructured":"Buckman, J., Gelada, C., Bellemare, M.G.: The importance of pessimism in fixed-dataset policy optimization. In: International Conference on Learning Representations (2020)"},{"key":"27_CR5","first-page":"15084","volume":"34","author":"L Chen","year":"2021","unstructured":"Chen, L., et al.: Decision transformer: reinforcement learning via sequence modeling. Adv. Neural Inf. Process. Syst. 34, 15084\u201315097 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR6","first-page":"18353","volume":"33","author":"X Chen","year":"2020","unstructured":"Chen, X., Zhou, Z., Wang, Z., Wang, C., Wu, Y., Ross, K.: Bail: best-action imitation learning for batch deep reinforcement learning. Adv. Neural Inf. Process. Syst. 33, 18353\u201318363 (2020)","journal-title":"Adv. Neural Inf. 
Process. Syst."},{"key":"27_CR7","unstructured":"Dadashi, R., Rezaeifar, S., Vieillard, N., Hussenot, L., Pietquin, O., Geist, M.: Offline reinforcement learning with pseudometric learning. In: International Conference on Machine Learning, pp. 2307\u20132318. PMLR (2021)"},{"key":"27_CR8","first-page":"11260","volume":"34","author":"R Fakoor","year":"2021","unstructured":"Fakoor, R., Mueller, J.W., Asadi, K., Chaudhari, P., Smola, A.J.: Continuous doubly constrained batch reinforcement learning. Adv. Neural Inf. Process. Syst. 34, 11260\u201311273 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR9","unstructured":"Fu, J., Kumar, A., Nachum, O., Tucker, G., Levine, S.: D4rl: datasets for deep data-driven reinforcement learning. arXiv preprint arXiv:2004.07219 (2020)"},{"key":"27_CR10","first-page":"20132","volume":"34","author":"S Fujimoto","year":"2021","unstructured":"Fujimoto, S., Gu, S.S.: A minimalist approach to offline reinforcement learning. Adv. Neural Inf. Process. Syst. 34, 20132\u201320145 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR11","unstructured":"Fujimoto, S., Hoof, H., Meger, D.: Addressing function approximation error in actor-critic methods. In: International Conference on Machine Learning, pp. 1587\u20131596. PMLR (2018)"},{"key":"27_CR12","unstructured":"Fujimoto, S., Meger, D., Precup, D.: Off-policy deep reinforcement learning without exploration. In: International Conference on Machine Learning, pp. 2052\u20132062. PMLR (2019)"},{"key":"27_CR13","unstructured":"Garg, D., Hejna, J., Geist, M., Ermon, S.: Extreme q-learning: maxent RL without entropy. arXiv preprint arXiv:2301.02328 (2023)"},{"key":"27_CR14","unstructured":"Ghasemipour, S.K.S., Schuurmans, D., Gu, S.S.: EMaQ: expected-max Q-learning operator for simple yet effective offline and online RL. In: International Conference on Machine Learning, pp. 3682\u20133691. PMLR (2021)"},{"key":"27_CR15","unstructured":"Gulcehre, C., et al.: Regularized behavior value estimation. arXiv preprint arXiv:2103.09575 (2021)"},{"key":"27_CR16","unstructured":"Haarnoja, T., Zhou, A., Abbeel, P., Levine, S.: Soft actor-critic: off-policy maximum entropy deep reinforcement learning with a stochastic actor. In: International Conference on Machine Learning, pp. 1861\u20131870. PMLR (2018)"},{"issue":"7825","key":"27_CR17","doi-asserted-by":"publisher","first-page":"357","DOI":"10.1038\/s41586-020-2649-2","volume":"585","author":"CR Harris","year":"2020","unstructured":"Harris, C.R., et al.: Array programming with numpy. Nature 585(7825), 357\u2013362 (2020)","journal-title":"Nature"},{"key":"27_CR18","unstructured":"Hoffman, M., et al.: Acme: a research framework for distributed reinforcement learning. arXiv preprint arXiv:2006.00979 (2020)"},{"key":"27_CR19","first-page":"1273","volume":"34","author":"M Janner","year":"2021","unstructured":"Janner, M., Li, Q., Levine, S.: Offline reinforcement learning as one big sequence modeling problem. Adv. Neural Inf. Process. Syst. 34, 1273\u20131286 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR20","unstructured":"Kostrikov, I., Fergus, R., Tompson, J., Nachum, O.: Offline reinforcement learning with fisher divergence critic regularization. In: International Conference on Machine Learning, pp. 5774\u20135783. PMLR (2021)"},{"key":"27_CR21","unstructured":"Kostrikov, I., Nair, A., Levine, S.: Offline reinforcement learning with implicit Q-learning. 
arXiv preprint arXiv:2110.06169 (2021)"},{"key":"27_CR22","unstructured":"Kumar, A., Fu, J., Soh, M., Tucker, G., Levine, S.: Stabilizing off-policy Q-learning via bootstrapping error reduction. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"27_CR23","first-page":"1179","volume":"33","author":"A Kumar","year":"2020","unstructured":"Kumar, A., Zhou, A., Tucker, G., Levine, S.: Conservative Q-learning for offline reinforcement learning. Adv. Neural Inf. Process. Syst. 33, 1179\u20131191 (2020)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR24","unstructured":"Lee, B.J., Lee, J., Kim, K.E.: Representation balancing offline model-based reinforcement learning. In: International Conference on Learning Representations (2020)"},{"key":"27_CR25","unstructured":"Levine, S.: Reinforcement learning and control as probabilistic inference: tutorial and review. arXiv preprint arXiv:1805.00909 (2018)"},{"key":"27_CR26","unstructured":"Levine, S., Kumar, A., Tucker, G., Fu, J.: Offline reinforcement learning: tutorial, review, and perspectives on open problems. arXiv preprint arXiv:2005.01643 (2020)"},{"key":"27_CR27","first-page":"1711","volume":"35","author":"J Lyu","year":"2022","unstructured":"Lyu, J., Ma, X., Li, X., Lu, Z.: Mildly conservative Q-learning for offline reinforcement learning. Adv. Neural Inf. Process. Syst. 35, 1711\u20131724 (2022)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR28","unstructured":"Peng, X.B., Kumar, A., Zhang, G., Levine, S.: Advantage-weighted regression: simple and scalable off-policy reinforcement learning. arXiv preprint arXiv:1910.00177 (2019)"},{"key":"27_CR29","doi-asserted-by":"crossref","unstructured":"Rezaeifar, S., et al.: Offline reinforcement learning as anti-exploration. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 8106\u20138114 (2022)","DOI":"10.1609\/aaai.v36i7.20783"},{"key":"27_CR30","doi-asserted-by":"crossref","unstructured":"Silver, D., et al.: Mastering the game of go without human knowledge. Nature 550(7676), 354\u2013359 (2017)","DOI":"10.1038\/nature24270"},{"issue":"7782","key":"27_CR31","doi-asserted-by":"publisher","first-page":"350","DOI":"10.1038\/s41586-019-1724-z","volume":"575","author":"O Vinyals","year":"2019","unstructured":"Vinyals, O., et al.: Grandmaster level in starcraft ii using multi-agent reinforcement learning. Nature 575(7782), 350\u2013354 (2019)","journal-title":"Nature"},{"key":"27_CR32","unstructured":"Wang, Z., Hunt, J.J., Zhou, M.: Diffusion policies as an expressive policy class for offline reinforcement learning. arXiv preprint arXiv:2208.06193 (2022)"},{"key":"27_CR33","first-page":"7768","volume":"33","author":"Z Wang","year":"2020","unstructured":"Wang, Z., et al.: Critic regularized regression. Adv. Neural Inf. Process. Syst. 33, 7768\u20137778 (2020)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR34","unstructured":"Wu, Y., Tucker, G., Nachum, O.: Behavior regularized offline reinforcement learning. arXiv preprint arXiv:1911.11361 (2019)"},{"key":"27_CR35","unstructured":"Yang, S., Wang, Z., Zheng, H., Feng, Y., Zhou, M.: A regularized implicit policy for offline reinforcement learning. arXiv preprint arXiv:2202.09673 (2022)"},{"key":"27_CR36","first-page":"28954","volume":"34","author":"T Yu","year":"2021","unstructured":"Yu, T., Kumar, A., Rafailov, R., Rajeswaran, A., Levine, S., Finn, C.: Combo: conservative offline model-based policy optimization. Adv. Neural Inf. Process. Syst. 
34, 28954\u201328967 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, G., Kashima, H.: Behavior estimation from multi-source data for offline reinforcement learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, pp. 11201\u201311209 (2023)","DOI":"10.1609\/aaai.v37i9.26326"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases: Research Track"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-43421-1_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,17]],"date-time":"2023-09-17T20:45:58Z","timestamp":1694983558000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-43421-1_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031434204","9783031434211"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-43421-1_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"18 September 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"Offline RL methods may bring benefits for social application scenarios when collecting new data is infeasible due to cost, privacy or safety. For example, learning to diagnose from historical medical records or designing recommendations given existing clicking records of some advertisements. For negative social impact, offline methods may enable big data discriminatory pricing to yield unfair market or improve the recommendation techniques to make more people to be addicted to the social media. However, our proposed methods is more related to introducing scientific thoughts and investigations, which do not target such possible applications. 
Additionally, this work will only use public benchmarks and data, so no personal data will be acquired or inferred.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Statement"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Turin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 September 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2023.ecmlpkdd.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"829","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"196","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"24% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.63","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the 
conference organizers)"}},{"value":"4.5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Applied Data Science Track: 239 submissions, 58 accepted papers; Demo Track: 31 submissions, 16 accepted papers.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}
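
For readers who want to reproduce or post-process a record of this shape, the following is a minimal sketch, not code from the chapter itself: it assumes the public Crossref REST API endpoint https://api.crossref.org/works/{DOI} and the third-party requests package, and the mailto address is a hypothetical placeholder used for Crossref's polite pool. It fetches this chapter's metadata and reads back a few of the fields shown above.

# Minimal sketch: fetch this Crossref work record and read a few fields.
# Assumptions: public Crossref REST API, third-party `requests` package,
# placeholder contact address (replace with your own).
import requests

DOI = "10.1007/978-3-031-43421-1_27"
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.org"},  # polite-pool etiquette; hypothetical address
    timeout=30,
)
resp.raise_for_status()
msg = resp.json()["message"]  # the same "message" object as in the record above

# Title, authors, and a few bibliographic fields.
print(msg["title"][0])
print(", ".join(f'{a.get("given", "")} {a["family"]}'.strip() for a in msg["author"]))
print(msg["DOI"], msg["page"], msg["references-count"])

# The ECML PKDD peer-review statistics live in the "assertion" array.
for a in msg.get("assertion", []):
    if a["name"] == "acceptance_rate_of_full_papers":
        print(a["label"], "->", a["value"])

Note that fields such as "assertion" are publisher-supplied and optional in the Crossref schema, hence the defensive msg.get("assertion", []) rather than direct indexing.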