{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T13:11:50Z","timestamp":1775913110328,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,4,30]],"date-time":"2023-04-30T00:00:00Z","timestamp":1682812800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,4,30]]},"DOI":"10.1145\/3543507.3583259","type":"proceedings-article","created":{"date-parts":[[2023,4,26]],"date-time":"2023-04-26T23:30:51Z","timestamp":1682551851000},"page":"865-875","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":40,"title":["Two-Stage Constrained Actor-Critic for Short Video Recommendation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6451-9299","authenticated-orcid":false,"given":"Qingpeng","family":"Cai","sequence":"first","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9340-0366","authenticated-orcid":false,"given":"Zhenghai","family":"Xue","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0150-6500","authenticated-orcid":false,"given":"Chi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3490-1088","authenticated-orcid":false,"given":"Wanqi","family":"Xue","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1440-911X","authenticated-orcid":false,"given":"Shuchang","family":"Liu","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3426-2784","authenticated-orcid":false,"given":"Ruohan","family":"Zhan","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5935-3757","authenticated-orcid":false,"given":"Xueliang","family":"Wang","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2733-1235","authenticated-orcid":false,"given":"Tianyou","family":"Zuo","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6733-2291","authenticated-orcid":false,"given":"Wentao","family":"Xie","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0424-9658","authenticated-orcid":false,"given":"Dong","family":"Zheng","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9266-0780","authenticated-orcid":false,"given":"Peng","family":"Jiang","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3636-3618","authenticated-orcid":false,"given":"Kun","family":"Gai","sequence":"additional","affiliation":[{"name":"Unaffiliated, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,4,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Reinforcement learning based recommender systems: A survey. arXiv preprint arXiv:2101.06286","author":"Afsar M\u00a0Mehdi","year":"2021","unstructured":"M\u00a0Mehdi Afsar, Trafford Crump, and Behrouz Far. 2021. Reinforcement learning based recommender systems: A survey. arXiv preprint arXiv:2101.06286 (2021)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2016.01.013"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013312"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3289600.3290999"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3220122"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3449846"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2988450.2988454"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.5555\/3122009.3242024"},{"key":"e_1_3_2_1_9_1","volume-title":"Safe exploration in continuous action spaces. arXiv preprint arXiv:1801.08757","author":"Dalal Gal","year":"2018","unstructured":"Gal Dalal, Krishnamurthy Dvijotham, Matej Vecerik, Todd Hester, Cosmin Paduraru, and Yuval Tassa. 2018. Safe exploration in continuous action spaces. arXiv preprint arXiv:1801.08757 (2018)."},{"key":"e_1_3_2_1_10_1","volume-title":"Deep reinforcement learning in large discrete action spaces. arXiv preprint arXiv:1512.07679","author":"Dulac-Arnold Gabriel","year":"2015","unstructured":"Gabriel Dulac-Arnold, Richard Evans, Hado van Hasselt, Peter Sunehag, Timothy Lillicrap, Jonathan Hunt, Timothy Mann, Theophane Weber, Thomas Degris, and Ben Coppin. 2015. Deep reinforcement learning in large discrete action spaces. arXiv preprint arXiv:1512.07679 (2015)."},{"key":"e_1_3_2_1_11_1","volume-title":"CIRS: Bursting Filter Bubbles by Counterfactual Interactive Recommender System. arXiv preprint arXiv:2204.01266","author":"Gao Chongming","year":"2022","unstructured":"Chongming Gao, Wenqiang Lei, Jiawei Chen, Shiqi Wang, Xiangnan He, Shijun Li, Biao Li, Yuan Zhang, and Peng Jiang. 2022. CIRS: Bursting Filter Bubbles by Counterfactual Interactive Recommender System. arXiv preprint arXiv:2204.01266 (2022)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557624"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/2789272.2886795"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437963.3441824"},{"key":"e_1_3_2_1_15_1","volume-title":"Toward Pareto Efficient Fairness-Utility Trade-off inRecommendation through Reinforcement Learning. arXiv preprint arXiv:2201.00140","author":"Ge Yingqiang","year":"2022","unstructured":"Yingqiang Ge, Xiaoting Zhao, Lucia Yu, Saurabh Paul, Diane Hu, Chu-Cheng Hsieh, and Yongfeng Zhang. 2022. Toward Pareto Efficient Fairness-Utility Trade-off inRecommendation through Reinforcement Learning. arXiv preprint arXiv:2201.00140 (2022)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557065"},{"key":"e_1_3_2_1_17_1","unstructured":"Huifeng Guo Ruiming Tang Yunming Ye Zhenguo Li and Xiuqiang He. 2017. DeepFM: a factorization-machine based neural network for CTR prediction. arXiv preprint arXiv:1703.04247 (2017)."},{"key":"e_1_3_2_1_18_1","volume-title":"Continuous control with deep reinforcement learning. arXiv preprint arXiv:1509.02971","author":"Lillicrap P","year":"2015","unstructured":"Timothy\u00a0P Lillicrap, Jonathan\u00a0J Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, Yuval Tassa, David Silver, and Daan Wierstra. 2015. Continuous control with deep reinforcement learning. arXiv preprint arXiv:1509.02971 (2015)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3298689.3346998"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539130"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2925019"},{"key":"e_1_3_2_1_22_1","volume-title":"Learning to rank for information retrieval. Foundations and Trends\u00ae in Information Retrieval 3, 3","author":"Tie-Yan","year":"2009","unstructured":"Tie-Yan Liu 2009. Learning to rank for information retrieval. Foundations and Trends\u00ae in Information Retrieval 3, 3 (2009), 225\u2013331."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/614"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380130"},{"key":"e_1_3_2_1_25_1","volume-title":"Multi-objective deep reinforcement learning. arXiv preprint arXiv:1610.02707","author":"Mossalam Hossam","year":"2016","unstructured":"Hossam Mossalam, Yannis\u00a0M Assael, Diederik\u00a0M Roijers, and Shimon Whiteson. 2016. Multi-objective deep reinforcement learning. arXiv preprint arXiv:1610.02707 (2016)."},{"key":"e_1_3_2_1_26_1","volume-title":"AWAC: Accelerating Online Reinforcement Learning with Offline Datasets.","author":"Nair Ashvin","year":"2020","unstructured":"Ashvin Nair, Murtaza Dalal, Abhishek Gupta, and Sergey Levine. 2020. AWAC: Accelerating Online Reinforcement Learning with Offline Datasets. (2020)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/EMBC.2016.7591355"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2020.103915"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3412744"},{"key":"e_1_3_2_1_30_1","volume-title":"Eligibility traces for off-policy policy evaluation","author":"Precup Doina","year":"2000","unstructured":"Doina Precup. 2000. Eligibility traces for off-policy policy evaluation. Computer Science Department Faculty Publication Series (2000), 80."},{"key":"e_1_3_2_1_31_1","unstructured":"Doina Precup Richard\u00a0S Sutton and Sanjoy Dasgupta. 2001. Off-policy temporal-difference learning with function approximation. In ICML. 417\u2013424."},{"key":"e_1_3_2_1_32_1","volume-title":"Multi-task learning as multi-objective optimization. arXiv preprint arXiv:1810.04650","author":"Sener Ozan","year":"2018","unstructured":"Ozan Sener and Vladlen Koltun. 2018. Multi-task learning as multi-objective optimization. arXiv preprint arXiv:1810.04650 (2018)."},{"key":"e_1_3_2_1_33_1","volume-title":"Choosing the Best of Both Worlds: Diverse and Novel Recommendations through Multi-Objective Reinforcement Learning. arXiv preprint arXiv:2110.15097","author":"Stamenkovic Dusan","year":"2021","unstructured":"Dusan Stamenkovic, Alexandros Karatzoglou, Ioannis Arapakis, Xin Xin, and Kleomenis Katevas. 2021. Choosing the Best of Both Worlds: Diverse and Novel Recommendations through Multi-Objective Reinforcement Learning. arXiv preprint arXiv:2110.15097 (2021)."},{"key":"e_1_3_2_1_34_1","volume-title":"Reinforcement learning: An introduction","author":"Sutton S","unstructured":"Richard\u00a0S Sutton and Andrew\u00a0G Barto. 2018. Reinforcement learning: An introduction. MIT press."},{"key":"e_1_3_2_1_35_1","volume-title":"Reward constrained policy optimization. arXiv preprint arXiv:1805.11074","author":"Tessler Chen","year":"2018","unstructured":"Chen Tessler, Daniel\u00a0J Mankowitz, and Shie Mannor. 2018. Reward constrained policy optimization. arXiv preprint arXiv:1805.11074 (2018)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539354"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539073"},{"key":"e_1_3_2_1_38_1","unstructured":"C\u00a0Ch White CC\u00a0III WHITE and KIM KW. 1980. Solution procedures for vector criterion Markov decision processes. (1980)."},{"key":"e_1_3_2_1_39_1","volume-title":"Simple statistical gradient-following algorithms for connectionist reinforcement learning. Reinforcement learning","author":"Williams J","year":"1992","unstructured":"Ronald\u00a0J Williams. 1992. Simple statistical gradient-following algorithms for connectionist reinforcement learning. Reinforcement learning (1992), 5\u201332."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331203"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488560.3498494"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539092"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219886"},{"key":"e_1_3_2_1_44_1","volume-title":"Deep reinforcement learning for list-wise recommendations. arXiv preprint arXiv:1801.00209","author":"Zhao Xiangyu","year":"2017","unstructured":"Xiangyu Zhao, Liang Zhang, Long Xia, Zhuoye Ding, Dawei Yin, and Jiliang Tang. 2017. Deep reinforcement learning for list-wise recommendations. arXiv preprint arXiv:1801.00209 (2017)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330668"}],"event":{"name":"WWW '23: The ACM Web Conference 2023","location":"Austin TX USA","acronym":"WWW '23","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2023"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3543507.3583259","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3543507.3583259","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:37:22Z","timestamp":1750178242000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3543507.3583259"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,30]]},"references-count":45,"alternative-id":["10.1145\/3543507.3583259","10.1145\/3543507"],"URL":"https:\/\/doi.org\/10.1145\/3543507.3583259","relation":{},"subject":[],"published":{"date-parts":[[2023,4,30]]},"assertion":[{"value":"2023-04-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}