{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T00:06:28Z","timestamp":1758931588586,"version":"3.44.0"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032060952","type":"print"},{"value":"9783032060969","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T00:00:00Z","timestamp":1758931200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T00:00:00Z","timestamp":1758931200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-06096-9_29","type":"book-chapter","created":{"date-parts":[[2025,9,26]],"date-time":"2025-09-26T09:55:27Z","timestamp":1758880527000},"page":"502-518","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Aggressive Exploration in\u00a0Offline Reinforcement Learning for\u00a0Better Recommendations"],"prefix":"10.1007","author":[{"given":"Kexin","family":"Shi","sequence":"first","affiliation":[]},{"given":"Wenjia","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Bingyi","family":"Jing","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,27]]},"reference":[{"key":"29_CR1","unstructured":"Barto, A.G.: Reinforcement learning: an introduction. by richard\u2019s sutton. SIAM Rev 6(2), 423 (2021)"},{"key":"29_CR2","doi-asserted-by":"crossref","unstructured":"Chaney, A.J., Stewart, B.M., Engelhardt, B.E.: How algorithmic confounding in recommendation systems increases homogeneity and decreases utility. In: Proceedings of the 12th ACM Conference on Recommender Systems, pp. 224\u2013232 (2018)","DOI":"10.1145\/3240323.3240370"},{"issue":"3","key":"29_CR3","first-page":"1","volume":"41","author":"J Chen","year":"2023","unstructured":"Chen, J., Dong, H., Wang, X., Feng, F., Wang, M., He, X.: Bias and debias in recommender system: a survey and future directions. ACM Trans. Inf. Syst. 41(3), 1\u201339 (2023)","journal-title":"ACM Trans. Inf. Syst."},{"key":"29_CR4","doi-asserted-by":"crossref","unstructured":"Chen, M., Beutel, A., Covington, P., Jain, S., Belletti, F., Chi, E.H.: Top-k off-policy correction for a reinforce recommender system. In: Proceedings of the Twelfth ACM International Conference on Web Search and Data Mining, pp. 456\u2013464 (2019)","DOI":"10.1145\/3289600.3290999"},{"key":"29_CR5","unstructured":"Deisenroth, M., Rasmussen, C.E.: PILCO: a model-based and data-efficient approach to policy search. In: Proceedings of the 28th International Conference on machine learning (ICML-11), pp. 465\u2013472 (2011)"},{"key":"29_CR6","unstructured":"Fujimoto, S., Meger, D., Precup, D.: Off-policy deep reinforcement learning without exploration. In: International Conference on Machine Learning, pp. 2052\u20132062. PMLR (2019)"},{"key":"29_CR7","doi-asserted-by":"crossref","unstructured":"Gao, C., et al.: Alleviating matthew effect of offline reinforcement learning in interactive recommendation. In: Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 238\u2013248 (2023)","DOI":"10.1145\/3539618.3591636"},{"key":"29_CR8","doi-asserted-by":"crossref","unstructured":"Gao, C., Li, S., Lei, W., Chen, J., Li, B., Jiang, P., He, X., Mao, J., Chua, T.S.: Kuairec: A fully-observed dataset and insights for evaluating recommender systems. In: Proceedings of the 31st ACM International Conference on Information & Knowledge Management. pp. 540\u2013550 (2022)","DOI":"10.1145\/3511808.3557220"},{"key":"29_CR9","doi-asserted-by":"crossref","unstructured":"Gao, C., et al.: Kuairand: an unbiased sequential recommendation dataset with randomly exposed videos. In: Proceedings of the 31st ACM International Conference on Information & Knowledge Management, pp. 3953\u20133957 (2022)","DOI":"10.1145\/3511808.3557624"},{"issue":"1","key":"29_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3594871","volume":"42","author":"C Gao","year":"2023","unstructured":"Gao, C., et al.: CIRS: bursting filter bubbles by counterfactual interactive recommender system. ACM Trans. Inf. Syst. 42(1), 1\u201327 (2023)","journal-title":"ACM Trans. Inf. Syst."},{"key":"29_CR11","doi-asserted-by":"crossref","unstructured":"Guo, H., Tang, R., Ye, Y., Li, Z., He, X.: DeepFM: a factorization-machine based neural network for CTR prediction. arXiv preprint arXiv:1703.04247 (2017)","DOI":"10.24963\/ijcai.2017\/239"},{"key":"29_CR12","doi-asserted-by":"crossref","unstructured":"Hu, Y., Koren, Y., Volinsky, C.: Collaborative filtering for implicit feedback datasets. In: 2008 Eighth IEEE International Conference on Data Mining, pp. 263\u2013272. IEEE (2008)","DOI":"10.1109\/ICDM.2008.22"},{"key":"29_CR13","unstructured":"Janner, M., Fu, J., Zhang, M., Levine, S.: When to trust your model: model-based policy optimization. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"29_CR14","first-page":"21810","volume":"33","author":"R Kidambi","year":"2020","unstructured":"Kidambi, R., Rajeswaran, A., Netrapalli, P., Joachims, T.: Morel: model-based offline reinforcement learning. Adv. Neural. Inf. Process. Syst. 33, 21810\u201321823 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"29_CR15","unstructured":"Kostrikov, I., Nair, A., Levine, S.: Offline reinforcement learning with implicit q-learning. arXiv preprint arXiv:2110.06169 (2021)"},{"key":"29_CR16","unstructured":"Kumar, A., Fu, J., Soh, M., Tucker, G., Levine, S.: Stabilizing off-policy q-learning via bootstrapping error reduction. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"29_CR17","first-page":"1179","volume":"33","author":"A Kumar","year":"2020","unstructured":"Kumar, A., Zhou, A., Tucker, G., Levine, S.: Conservative q-learning for offline reinforcement learning. Adv. Neural. Inf. Process. Syst. 33, 1179\u20131191 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"1","key":"29_CR18","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1016\/0196-8858(85)90002-8","volume":"6","author":"TL Lai","year":"1985","unstructured":"Lai, T.L., Robbins, H.: Asymptotically efficient adaptive allocation rules. Adv. Appl. Math. 6(1), 4\u201322 (1985)","journal-title":"Adv. Appl. Math."},{"key":"29_CR19","doi-asserted-by":"crossref","unstructured":"Marlin, B.M., Zemel, R.S.: Collaborative prediction and ranking with non-random missing data. In: Proceedings of the third ACM Conference on Recommender Systems, pp. 5\u201312 (2009)","DOI":"10.1145\/1639714.1639717"},{"key":"29_CR20","unstructured":"Mnih, V., et al.: Asynchronous methods for deep reinforcement learning. In: International Conference on Machine Learning, pp. 1928\u20131937. PmLR (2016)"},{"key":"29_CR21","doi-asserted-by":"crossref","unstructured":"Pradel, B., Usunier, N., Gallinari, P.: Ranking with non-random missing ratings: influence of popularity and positivity on evaluation metrics. In: Proceedings of the sixth ACM Conference on Recommender Systems, pp. 147\u2013154 (2012)","DOI":"10.1145\/2365952.2365982"},{"key":"29_CR22","unstructured":"Schnabel, T., Swaminathan, A., Singh, A., Chandak, N., Joachims, T.: Recommendations as treatments: Debiasing learning and evaluation. In: International Conference on Machine Learning, pp. 1670\u20131679. PMLR (2016)"},{"key":"29_CR23","unstructured":"Swaminathan, A., Joachims, T.: Counterfactual risk minimization: learning from logged bandit feedback. In: International conference on machine learning. pp. 814\u2013823. PMLR (2015)"},{"key":"29_CR24","first-page":"7768","volume":"33","author":"Z Wang","year":"2020","unstructured":"Wang, Z., et al.: Critic regularized regression. Adv. Neural. Inf. Process. Syst. 33, 7768\u20137778 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"29_CR25","doi-asserted-by":"crossref","unstructured":"Xin, X., Karatzoglou, A., Arapakis, I., Jose, J.M.: Self-supervised reinforcement learning for recommender systems. In: Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 931\u2013940 (2020)","DOI":"10.1145\/3397271.3401147"},{"key":"29_CR26","doi-asserted-by":"crossref","unstructured":"Yu, C., Lakshmanan, L., Amer-Yahia, S.: It takes variety to make a world: diversification in recommender systems. In: Proceedings of the 12th International Conference on Extending Database Technology: Advances in Database Technology, pp. 368\u2013378 (2009)","DOI":"10.1145\/1516360.1516404"},{"key":"29_CR27","first-page":"28954","volume":"34","author":"T Yu","year":"2021","unstructured":"Yu, T., Kumar, A., Rafailov, R., Rajeswaran, A., Levine, S., Finn, C.: COMBO: conservative offline model-based policy optimization. Adv. Neural. Inf. Process. Syst. 34, 28954\u201328967 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"29_CR28","first-page":"14129","volume":"33","author":"T Yu","year":"2020","unstructured":"Yu, T., et al.: MOPO: model-based offline policy optimization. Adv. Neural. Inf. Process. Syst. 33, 14129\u201314142 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"29_CR29","first-page":"54421","volume":"37","author":"J Zhang","year":"2025","unstructured":"Zhang, J., Fang, L., Shi, K., Wang, W., Jing, B.: Q-distribution guided q-learning for offline reinforcement learning: Uncertainty penalized q-value via consistency model. Adv. Neural. Inf. Process. Syst. 37, 54421\u201354462 (2025)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"29_CR30","first-page":"5616","volume":"36","author":"J Zhang","year":"2023","unstructured":"Zhang, J., Zhang, C., Wang, W., Jing, B.: Constrained policy optimization with explicit behavior density for offline reinforcement learning. Adv. Neural. Inf. Process. Syst. 36, 5616\u20135630 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"29_CR31","unstructured":"Zhang, R., Yu, T., Shen, Y., Jin, H., Chen, C., Carin, L.: Reward constrained interactive recommendation with natural language feedback. arXiv preprint arXiv:2005.01618 (2020)"},{"key":"29_CR32","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Qiu, R., Liu, J., Wang, S.: Roler: effective reward shaping in offline reinforcement learning for recommender systems. In: Proceedings of the 33rd ACM International Conference on Information and Knowledge Management, pp. 3269\u20133278 (2024)","DOI":"10.1145\/3627673.3679633"},{"key":"29_CR33","doi-asserted-by":"crossref","unstructured":"Zhao, X., Xia, L., Zhang, L., Ding, Z., Yin, D., Tang, J.: Deep reinforcement learning for page-wise recommendations. In: Proceedings of the 12th ACM Conference on Recommender Systems, pp. 95\u2013103 (2018)","DOI":"10.1145\/3240323.3240374"},{"key":"29_CR34","doi-asserted-by":"crossref","unstructured":"Zhao, X., Xia, L., Zou, L., Liu, H., Yin, D., Tang, J.: Whole-chain recommendations. In: Proceedings of the 29th ACM International Conference on Information & Knowledge Management, pp. 1883\u20131891 (2020)","DOI":"10.1145\/3340531.3412044"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases. Research Track"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-06096-9_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,26]],"date-time":"2025-09-26T09:55:39Z","timestamp":1758880539000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-06096-9_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,27]]},"ISBN":["9783032060952","9783032060969"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-06096-9_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,27]]},"assertion":[{"value":"27 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Porto","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Portugal","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecmlpkdd.org\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}