{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T16:49:19Z","timestamp":1755794959167,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,7,20]],"date-time":"2025-07-20T00:00:00Z","timestamp":1752969600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,20]]},"DOI":"10.1145\/3690624.3709437","type":"proceedings-article","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T18:44:43Z","timestamp":1743792283000},"page":"2458-2468","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Session-Level Dynamic Ad Load Optimization using Offline Robust Reinforcement Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7879-5315","authenticated-orcid":false,"given":"Tao","family":"Liu","sequence":"first","affiliation":[{"name":"Meta, Sunnyvale, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3829-6786","authenticated-orcid":false,"given":"Qi","family":"Xu","sequence":"additional","affiliation":[{"name":"Meta, Sunnyvale, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9106-1092","authenticated-orcid":false,"given":"Wei","family":"Shi","sequence":"additional","affiliation":[{"name":"Meta, Sunnyvale, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2045-0701","authenticated-orcid":false,"given":"Zhigang","family":"Hua","sequence":"additional","affiliation":[{"name":"Meta, Sunnyvale, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1607-7446","authenticated-orcid":false,"given":"Shuang","family":"Yang","sequence":"additional","affiliation":[{"name":"Meta, Sunnyvale, CA, 
USA"}]}],"member":"320","published-online":{"date-parts":[[2025,7,20]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Reinforcement learning: Theory and algorithms. CS Dept","author":"Agarwal Alekh","year":"2019","unstructured":"Alekh Agarwal, Nan Jiang, Sham M Kakade, and Wen Sun. 2019. Reinforcement learning: Theory and algorithms. CS Dept., UW Seattle, Seattle, WA, USA, Tech. Rep, Vol. 32 (2019)."},{"key":"e_1_3_2_2_2_1","volume-title":"Openai gym. arXiv preprint arXiv:1606.01540","author":"Brockman Greg","year":"2016","unstructured":"Greg Brockman, Vicki Cheung, Ludwig Pettersson, Jonas Schneider, John Schulman, Jie Tang, and Wojciech Zaremba. 2016. Openai gym. arXiv preprint arXiv:1606.01540 (2016)."},{"key":"e_1_3_2_2_3_1","unstructured":"Carlos Carrion Zenan Wang Harikesh Nair Xianghong Luo Yulin Lei Xiliang Lin Wenlong Chen Qiyu Hu Changping Peng Yongjun Bao et al. 2021. Blending Advertising with Organic Content in E-Commerce: A Virtual Bids Optimization Approach. arXiv preprint arXiv:2105.13556 (2021)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557103"},{"key":"e_1_3_2_2_5_1","volume-title":"International Conference on Machine Learning. PMLR, 1042--1051","author":"Chen Jinglin","year":"2019","unstructured":"Jinglin Chen and Nan Jiang. 2019. Information-theoretic considerations in batch reinforcement learning. In International Conference on Machine Learning. PMLR, 1042--1051."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Victor Chernozhukov Denis Chetverikov Mert Demirer Esther Duflo Christian Hansen Whitney Newey and James Robins. 2018. Double\/debiased machine learning for treatment and structural parameters.","DOI":"10.3386\/w23564"},{"key":"e_1_3_2_2_7_1","volume-title":"Improve User Retention with Causal Learning. In The 2019 ACM SIGKDD Workshop on Causal Discovery. PMLR, 34--49","author":"Du Shuyang","year":"2019","unstructured":"Shuyang Du, James Lee, and Farzin Ghaffarizadeh. 2019. 
Improve User Retention with Causal Learning. In The 2019 ACM SIGKDD Workshop on Causal Discovery. PMLR, 34--49."},{"key":"e_1_3_2_2_8_1","volume-title":"Endogenous selection bias: The problem of conditioning on a collider variable. Annual review of sociology","author":"Elwert Felix","year":"2014","unstructured":"Felix Elwert and Christopher Winship. 2014. Endogenous selection bias: The problem of conditioning on a collider variable. Annual review of sociology, Vol. 40 (2014), 31--53."},{"key":"e_1_3_2_2_9_1","volume-title":"D4rl: Datasets for deep data-driven reinforcement learning. arXiv preprint arXiv:2004.07219","author":"Fu Justin","year":"2020","unstructured":"Justin Fu, Aviral Kumar, Ofir Nachum, George Tucker, and Sergey Levine. 2020. D4rl: Datasets for deep data-driven reinforcement learning. arXiv preprint arXiv:2004.07219 (2020)."},{"key":"e_1_3_2_2_10_1","volume-title":"International conference on machine learning. PMLR","author":"Fujimoto Scott","year":"2019","unstructured":"Scott Fujimoto, David Meger, and Doina Precup. 2019. Off-policy deep reinforcement learning without exploration. In International conference on machine learning. PMLR, 2052--2062."},{"key":"e_1_3_2_2_11_1","volume-title":"International conference on predictive applications and APIs. PMLR, 1--13","author":"Gutierrez Pierre","year":"2017","unstructured":"Pierre Gutierrez and Jean-Yves G\u00e9rardy. 2017. Causal inference and uplift modelling: A review of the literature. In International conference on predictive applications and APIs. PMLR, 1--13."},{"key":"e_1_3_2_2_12_1","volume-title":"International conference on machine learning. PMLR","author":"Haarnoja Tuomas","year":"2018","unstructured":"Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine. 2018. Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor. In International conference on machine learning. 
PMLR, 1861--1870."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2783258.2788583"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1287\/moor.1040.0129"},{"key":"e_1_3_2_2_15_1","volume-title":"International Conference on Machine Learning. PMLR, 652--661","author":"Jiang Nan","year":"2016","unstructured":"Nan Jiang and Lihong Li. 2016. Doubly robust off-policy value evaluation for reinforcement learning. In International Conference on Machine Learning. PMLR, 652--661."},{"key":"e_1_3_2_2_16_1","volume-title":"Morel: Model-based offline reinforcement learning. Advances in neural information processing systems","author":"Kidambi Rahul","year":"2020","unstructured":"Rahul Kidambi, Aravind Rajeswaran, Praneeth Netrapalli, and Thorsten Joachims. 2020. Morel: Model-based offline reinforcement learning. Advances in neural information processing systems, Vol. 33 (2020), 21810--21823."},{"key":"e_1_3_2_2_17_1","volume-title":"Offline reinforcement learning with implicit q-learning. arXiv preprint arXiv:2110.06169","author":"Kostrikov Ilya","year":"2021","unstructured":"Ilya Kostrikov, Ashvin Nair, and Sergey Levine. 2021. Offline reinforcement learning with implicit q-learning. arXiv preprint arXiv:2110.06169 (2021)."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i7.20686"},{"key":"e_1_3_2_2_19_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Kumar Aviral","year":"2019","unstructured":"Aviral Kumar, Justin Fu, Matthew Soh, George Tucker, and Sergey Levine. 2019. Stabilizing off-policy q-learning via bootstrapping error reduction. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_2_20_1","first-page":"1179","article-title":"Conservative q-learning for offline reinforcement learning","volume":"33","author":"Kumar Aviral","year":"2020","unstructured":"Aviral Kumar, Aurick Zhou, George Tucker, and Sergey Levine. 2020. 
Conservative q-learning for offline reinforcement learning. Advances in Neural Information Processing Systems, Vol. 33 (2020), 1179--1191.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_21_1","volume-title":"Efficient policy iteration for robust markov decision processes via regularization. arXiv preprint arXiv:2205.14327","author":"Kumar Navdeep","year":"2022","unstructured":"Navdeep Kumar, Kfir Levy, Kaixin Wang, and Shie Mannor. 2022. Efficient policy iteration for robust markov decision processes via regularization. arXiv preprint arXiv:2205.14327 (2022)."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1804597116"},{"key":"e_1_3_2_2_23_1","volume-title":"Offline reinforcement learning: Tutorial, review, and perspectives on open problems. arXiv preprint arXiv:2005.01643","author":"Levine Sergey","year":"2020","unstructured":"Sergey Levine, Aviral Kumar, George Tucker, and Justin Fu. 2020. Offline reinforcement learning: Tutorial, review, and perspectives on open problems. arXiv preprint arXiv:2005.01643 (2020)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531847"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3512109"},{"key":"e_1_3_2_2_26_1","volume-title":"Continuous control with deep reinforcement learning. arXiv preprint arXiv:1509.02971","author":"Lillicrap Timothy P","year":"2015","unstructured":"Timothy P Lillicrap, Jonathan J Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, Yuval Tassa, David Silver, and Daan Wierstra. 2015. Continuous control with deep reinforcement learning. arXiv preprint arXiv:1509.02971 (2015)."},{"key":"e_1_3_2_2_27_1","volume-title":"Explicit Feature Interaction-aware Uplift Network for Online Marketing. arXiv preprint arXiv:2306.00315","author":"Liu Dugang","year":"2023","unstructured":"Dugang Liu, Xing Tang, Han Gao, Fuyuan Lyu, and Xiuqiang He. 2023a. 
Explicit Feature Interaction-aware Uplift Network for Online Marketing. arXiv preprint arXiv:2306.00315 (2023)."},{"key":"e_1_3_2_2_28_1","volume-title":"On the Need for a Language Describing Distribution Shifts: Illustrations on Tabular Datasets. arXiv preprint arXiv:2307.05284","author":"Liu Jiashuo","year":"2023","unstructured":"Jiashuo Liu, Tianyu Wang, Peng Cui, and Hongseok Namkoong. 2023b. On the Need for a Language Describing Distribution Shifts: Illustrations on Tabular Datasets. arXiv preprint arXiv:2307.05284 (2023)."},{"key":"e_1_3_2_2_29_1","first-page":"17183","article-title":"Learning policies with zero or bounded constraint violation for constrained mdps","volume":"34","author":"Liu Tao","year":"2021","unstructured":"Tao Liu, Ruida Zhou, Dileep Kalathil, Panganamala Kumar, and Chao Tian. 2021a. Learning policies with zero or bounded constraint violation for constrained mdps. Advances in Neural Information Processing Systems, Vol. 34 (2021), 17183--17193.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_30_1","volume-title":"Policy Optimization for Constrained MDPs with Provable Fast Global Convergence. arXiv preprint arXiv:2111.00552","author":"Liu Tao","year":"2021","unstructured":"Tao Liu, Ruida Zhou, Dileep Kalathil, PR Kumar, and Chao Tian. 2021b. Policy Optimization for Constrained MDPs with Provable Fast Global Convergence. arXiv preprint arXiv:2111.00552 (2021)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"crossref","unstructured":"Volodymyr Mnih Koray Kavukcuoglu David Silver Andrei A Rusu Joel Veness Marc G Bellemare Alex Graves Martin Riedmiller Andreas K Fidjeland Georg Ostrovski et al. 2015. Human-level control through deep reinforcement learning. nature Vol. 518 7540 (2015) 529--533.","DOI":"10.1038\/nature14236"},{"key":"e_1_3_2_2_32_1","volume-title":"Integral probability metrics and their generating classes of functions. 
Advances in applied probability","author":"M\u00fcller Alfred","year":"1997","unstructured":"Alfred M\u00fcller. 1997. Integral probability metrics and their generating classes of functions. Advances in applied probability, Vol. 29, 2 (1997), 429--443."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/asaa076"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1287\/opre.1050.0216"},{"key":"e_1_3_2_2_35_1","volume-title":"Robust reinforcement learning using offline data. Advances in neural information processing systems","author":"Panaganti Kishan","year":"2022","unstructured":"Kishan Panaganti, Zaiyan Xu, Dileep Kalathil, and Mohammad Ghavamzadeh. 2022. Robust reinforcement learning using offline data. Advances in neural information processing systems, Vol. 35 (2022), 32211--32224."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1287\/mksc.2022.1423"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3987"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2010.62"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3635846"},{"key":"e_1_3_2_2_40_1","volume-title":"International conference on machine learning. PMLR, 3076--3085","author":"Shalit Uri","year":"2017","unstructured":"Uri Shalit, Fredrik D Johansson, and David Sontag. 2017. Estimating individual treatment effect: generalization bounds and algorithms. In International conference on machine learning. PMLR, 3076--3085."},{"key":"e_1_3_2_2_41_1","first-page":"5628","article-title":"Debiased causal tree: heterogeneous treatment effects estimation with unmeasured confounding","volume":"35","author":"Tang Caizhi","year":"2022","unstructured":"Caizhi Tang, Huiyuan Wang, Xinyu Li, Qing Cui, Ya-Lin Zhang, Feng Zhu, Longfei Li, Jun Zhou, and Linbo Jiang. 2022. Debiased causal tree: heterogeneous treatment effects estimation with unmeasured confounding. 
Advances in Neural Information Processing Systems, Vol. 35 (2022), 5628--5640.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_42_1","first-page":"7193","article-title":"Online robust reinforcement learning with model uncertainty","volume":"34","author":"Wang Yue","year":"2021","unstructured":"Yue Wang and Shaofeng Zou. 2021. Online robust reinforcement learning with model uncertainty. Advances in Neural Information Processing Systems, Vol. 34 (2021), 7193--7206.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557611"},{"key":"e_1_3_2_2_44_1","volume-title":"International conference on machine learning. PMLR","author":"Wang Ziyu","year":"2016","unstructured":"Ziyu Wang, Tom Schaul, Matteo Hessel, Hado Hasselt, Marc Lanctot, and Nando Freitas. 2016. Dueling network architectures for deep reinforcement learning. In International conference on machine learning. PMLR, 1995--2003."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599809"},{"key":"e_1_3_2_2_46_1","volume-title":"Bellman-consistent pessimism for offline reinforcement learning. Advances in neural information processing systems","author":"Xie Tengyang","year":"2021","unstructured":"Tengyang Xie, Ching-An Cheng, Nan Jiang, Paul Mineiro, and Alekh Agarwal. 2021. Bellman-consistent pessimism for offline reinforcement learning. Advances in neural information processing systems, Vol. 34 (2021), 6683--6694."},{"key":"e_1_3_2_2_47_1","volume-title":"ResAct: Reinforcing long-term engagement in sequential recommendation with residual actor. arXiv preprint arXiv:2206.02620","author":"Xue Wanqi","year":"2022","unstructured":"Wanqi Xue, Qingpeng Cai, Ruohan Zhan, Dong Zheng, Peng Jiang, Kun Gai, and Bo An. 2022. ResAct: Reinforcing long-term engagement in sequential recommendation with residual actor. 
arXiv preprint arXiv:2206.02620 (2022)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403391"},{"key":"e_1_3_2_2_49_1","first-page":"14129","article-title":"Mopo: Model-based offline policy optimization","volume":"33","author":"Yu Tianhe","year":"2020","unstructured":"Tianhe Yu, Garrett Thomas, Lantao Yu, Stefano Ermon, James Y Zou, Sergey Levine, Chelsea Finn, and Tengyu Ma. 2020. Mopo: Model-based offline policy optimization. Advances in Neural Information Processing Systems, Vol. 33 (2020), 14129--14142.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_50_1","volume-title":"Natural Actor-Critic for Robust Reinforcement Learning with Function Approximation. In Thirty-seventh Conference on Neural Information Processing Systems.","author":"Zhou Ruida","year":"2023","unstructured":"Ruida Zhou, Tao Liu, Min Cheng, Dileep Kalathil, Panganamala Kumar, and Chao Tian. 2023. Natural Actor-Critic for Robust Reinforcement Learning with Function Approximation. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_2_51_1","first-page":"13584","article-title":"Anchor-changing regularized natural policy gradient for multi-objective reinforcement learning","volume":"35","author":"Zhou Ruida","year":"2022","unstructured":"Ruida Zhou, Tao Liu, Dileep Kalathil, PR Kumar, and Chao Tian. 2022. Anchor-changing regularized natural policy gradient for multi-objective reinforcement learning. Advances in Neural Information Processing Systems, Vol. 35 (2022), 13584--13596. 
","journal-title":"Advances in Neural Information Processing Systems"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"location":"Toronto ON Canada","acronym":"KDD '25"},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3690624.3709437","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3690624.3709437","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T15:44:10Z","timestamp":1755359050000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3690624.3709437"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,20]]},"references-count":51,"alternative-id":["10.1145\/3690624.3709437","10.1145\/3690624"],"URL":"https:\/\/doi.org\/10.1145\/3690624.3709437","relation":{},"subject":[],"published":{"date-parts":[[2025,7,20]]},"assertion":[{"value":"2025-07-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}