{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T20:38:15Z","timestamp":1769632695250,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,7,25]],"date-time":"2020-07-25T00:00:00Z","timestamp":1595635200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSFC","award":["61702327"],"award-info":[{"award-number":["61702327"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,7,25]]},"DOI":"10.1145\/3397271.3401467","type":"proceedings-article","created":{"date-parts":[[2020,7,25]],"date-time":"2020-07-25T07:50:08Z","timestamp":1595663408000},"page":"2468-2471","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":42,"title":["Deep Reinforcement Learning for Information Retrieval: Fundamentals and Advances"],"prefix":"10.1145","author":[{"given":"Weinan","family":"Zhang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Xiangyu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Michigan State University, Lansing, MI, USA"}]},{"given":"Li","family":"Zhao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"given":"Dawei","family":"Yin","sequence":"additional","affiliation":[{"name":"Baidu, Sunnyvale, CA, USA"}]},{"given":"Grace Hui","family":"Yang","sequence":"additional","affiliation":[{"name":"Georgetown University, Washington DC, DC, USA"}]},{"given":"Alex","family":"Beutel","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2020,7,25]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"2002 a. Finite-time analysis of the multiarmed bandit problem. Machine learning","author":"Auer Peter","year":"2002","unstructured":"Peter Auer , Nicolo Cesa-Bianchi , and Paul Fischer . 2002 a. Finite-time analysis of the multiarmed bandit problem. Machine learning , Vol. 47 , 2--3 ( 2002 ), 235--256. Peter Auer, Nicolo Cesa-Bianchi, and Paul Fischer. 2002 a. Finite-time analysis of the multiarmed bandit problem. Machine learning, Vol. 47, 2--3 (2002), 235--256."},{"key":"e_1_3_2_2_2_1","series-title":"SIAM journal on computing","volume-title":"2002 b. The nonstochastic multiarmed bandit problem","author":"Auer Peter","year":"2002","unstructured":"Peter Auer , Nicolo Cesa-Bianchi , Yoav Freund , and Robert E Schapire . 2002 b. The nonstochastic multiarmed bandit problem . SIAM journal on computing , Vol. 32 , 1 ( 2002 ), 48--77. Peter Auer, Nicolo Cesa-Bianchi, Yoav Freund, and Robert E Schapire. 2002 b. The nonstochastic multiarmed bandit problem. SIAM journal on computing, Vol. 32, 1 (2002), 48--77."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Han Cai Kan Ren Weinan Zhang Kleanthis Malialis Jun Wang Yong Yu and Defeng Guo. 2017. Real-time bidding by reinforcement learning in display advertising. In WSDM. 661--670. Han Cai Kan Ren Weinan Zhang Kleanthis Malialis Jun Wang Yong Yu and Defeng Guo. 2017. Real-time bidding by reinforcement learning in display advertising. In WSDM. 661--670.","DOI":"10.1145\/3018661.3018702"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2006.152"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013312"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Minmin Chen Alex Beutel Paul Covington Sagar Jain Francois Belletti and Ed H Chi. 2019 a. Top-K Off-Policy Correction for a REINFORCE Recommender System. In WSDM. ACM 456--464. Minmin Chen Alex Beutel Paul Covington Sagar Jain Francois Belletti and Ed H Chi. 2019 a. Top-K Off-Policy Correction for a REINFORCE Recommender System. In WSDM. ACM 456--464.","DOI":"10.1145\/3289600.3290999"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/1924475.1924485"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1860702.1860709"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12063"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2018396.2018423"},{"key":"e_1_3_2_2_11_1","volume-title":"Optimizing Sponsored Search Ranking Strategy by Deep Reinforcement Learning. arXiv preprint arXiv:1803.07347","author":"He Li","year":"2018","unstructured":"Li He , Liang Wang , Kaipeng Liu , Bo Wu , and Weinan Zhang . 2018. Optimizing Sponsored Search Ranking Strategy by Deep Reinforcement Learning. arXiv preprint arXiv:1803.07347 ( 2018 ). Li He, Liang Wang, Kaipeng Liu, Bo Wu, and Weinan Zhang. 2018. Optimizing Sponsored Search Ranking Strategy by Deep Reinforcement Learning. arXiv preprint arXiv:1803.07347 (2018)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Junqi Jin Chengru Song Han Li Kun Gai Jun Wang and Weinan Zhang. 2018. Real-time bidding with multi-agent reinforcement learning in display advertising. In CIKM. 2193--2201. Junqi Jin Chengru Song Han Li Kun Gai Jun Wang and Weinan Zhang. 2018. Real-time bidding with multi-agent reinforcement learning in display advertising. In CIKM. 2193--2201.","DOI":"10.1145\/3269206.3272021"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/1622737.1622748"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1177\/0278364915619772"},{"key":"e_1_3_2_2_15_1","volume-title":"Deep reinforcement learning based recommendation with explicit user-item interactions modeling. arXiv preprint arXiv:1810.12027","author":"Liu Feng","year":"2018","unstructured":"Feng Liu , Ruiming Tang , Xutao Li , Weinan Zhang , Yunming Ye , Haokun Chen , Huifeng Guo , and Yuzhou Zhang . 2018. Deep reinforcement learning based recommendation with explicit user-item interactions modeling. arXiv preprint arXiv:1810.12027 ( 2018 ). Feng Liu, Ruiming Tang, Xutao Li, Weinan Zhang, Yunming Ye, Haokun Chen, Huifeng Guo, and Yuzhou Zhang. 2018. Deep reinforcement learning based recommendation with explicit user-item interactions modeling. arXiv preprint arXiv:1810.12027 (2018)."},{"key":"e_1_3_2_2_16_1","volume-title":"Mehdi Mirza, Alex Graves, Timothy Lillicrap, Tim Harley, David Silver, and Koray Kavukcuoglu.","author":"Mnih Volodymyr","year":"2016","unstructured":"Volodymyr Mnih , Adria Puigdomenech Badia , Mehdi Mirza, Alex Graves, Timothy Lillicrap, Tim Harley, David Silver, and Koray Kavukcuoglu. 2016 . Asynchronous methods for deep reinforcement learning. In ICML. 1928--1937. Volodymyr Mnih, Adria Puigdomenech Badia, Mehdi Mirza, Alex Graves, Timothy Lillicrap, Tim Harley, David Silver, and Koray Kavukcuoglu. 2016. Asynchronous methods for deep reinforcement learning. In ICML. 1928--1937."},{"key":"e_1_3_2_2_17_1","volume-title":"Playing atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602","author":"Mnih Volodymyr","year":"2013","unstructured":"Volodymyr Mnih , Koray Kavukcuoglu , David Silver , Alex Graves , Ioannis Antonoglou , Daan Wierstra , and Martin Riedmiller . 2013. Playing atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602 ( 2013 ). Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis Antonoglou, Daan Wierstra, and Martin Riedmiller. 2013. Playing atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602 (2013)."},{"key":"e_1_3_2_2_18_1","volume-title":"Learning to coordinate multiple reinforcement learning agents for diverse query reformulation. arXiv preprint arXiv:1809.10658","author":"Nogueira Rodrigo","year":"2018","unstructured":"Rodrigo Nogueira , Jannis Bulian , and Massimiliano Ciaramita . 2018. Learning to coordinate multiple reinforcement learning agents for diverse query reformulation. arXiv preprint arXiv:1809.10658 ( 2018 ). Rodrigo Nogueira, Jannis Bulian, and Massimiliano Ciaramita. 2018. Learning to coordinate multiple reinforcement learning agents for diverse query reformulation. arXiv preprint arXiv:1809.10658 (2018)."},{"key":"e_1_3_2_2_19_1","volume-title":"Task-oriented query reformulation with reinforcement learning. arXiv preprint arXiv:1704.04572","author":"Nogueira Rodrigo","year":"2017","unstructured":"Rodrigo Nogueira and Kyunghyun Cho . 2017. Task-oriented query reformulation with reinforcement learning. arXiv preprint arXiv:1704.04572 ( 2017 ). Rodrigo Nogueira and Kyunghyun Cho. 2017. Task-oriented query reformulation with reinforcement learning. arXiv preprint arXiv:1704.04572 (2017)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Alessandro Nuara Francesco Trovo Nicola Gatti and Marcello Restelli. 2018. A combinatorial-bandit algorithm for the online joint bid\/budget optimization of pay-per-click advertising campaigns. In AAAI. Alessandro Nuara Francesco Trovo Nicola Gatti and Marcello Restelli. 2018. A combinatorial-bandit algorithm for the online joint bid\/budget optimization of pay-per-click advertising campaigns. In AAAI.","DOI":"10.1609\/aaai.v32i1.11888"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3233770"},{"key":"e_1_3_2_2_22_1","unstructured":"Razieh Rahimi and Grace Hui Yang. [n. d.]. Modeling Exploration of Intrinsically Diverse Search Tasks as Markov Decision Processes. ([n. d.]). Razieh Rahimi and Grace Hui Yang. [n. d.]. Modeling Exploration of Intrinsically Diverse Search Tasks as Markov Decision Processes. ([n. d.])."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Konstantin Salomatin Tie-Yan Liu and Yiming Yang. 2012. A unified optimization framework for auction and guaranteed delivery in online advertising. In CIKM. 2005--2009. Konstantin Salomatin Tie-Yan Liu and Yiming Yang. 2012. A unified optimization framework for auction and guaranteed delivery in online advertising. In CIKM. 2005--2009.","DOI":"10.1145\/2396761.2398561"},{"key":"e_1_3_2_2_24_1","volume-title":"JMLR","volume":"6","author":"Shani Guy","year":"2005","unstructured":"Guy Shani , David Heckerman , and Ronen I Brafman . 2005 . An MDP-based recommender system . JMLR , Vol. 6 , Sep (2005), 1265--1295. Guy Shani, David Heckerman, and Ronen I Brafman. 2005. An MDP-based recommender system. JMLR, Vol. 6, Sep (2005), 1265--1295."},{"key":"e_1_3_2_2_25_1","volume-title":"Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al.","author":"Silver David","year":"2016","unstructured":"David Silver , Aja Huang , Chris J Maddison , Arthur Guez , Laurent Sifre , George Van Den Driessche , Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al. 2016 . Mastering the game of Go with deep neural networks and tree search. nature, Vol. 529 , 7587 (2016), 484. David Silver, Aja Huang, Chris J Maddison, Arthur Guez, Laurent Sifre, George Van Den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al. 2016. Mastering the game of Go with deep neural networks and tree search. nature, Vol. 529, 7587 (2016), 484."},{"key":"e_1_3_2_2_26_1","volume-title":"Nature","volume":"550","author":"Silver David","year":"2017","unstructured":"David Silver , Julian Schrittwieser , Karen Simonyan , Ioannis Antonoglou , Aja Huang , Arthur Guez , Thomas Hubert , Lucas Baker , Matthew Lai , Adrian Bolton , 2017 . Mastering the game of Go without human knowledge . Nature , Vol. 550 , 7676 (2017), 354. David Silver, Julian Schrittwieser, Karen Simonyan, Ioannis Antonoglou, Aja Huang, Arthur Guez, Thomas Hubert, Lucas Baker, Matthew Lai, Adrian Bolton, et al. 2017. Mastering the game of Go without human knowledge. Nature, Vol. 550, 7676 (2017), 354."},{"key":"e_1_3_2_2_27_1","volume-title":"Reinforcement learning: An introduction","author":"Sutton Richard S","unstructured":"Richard S Sutton and Andrew G Barto . 2018. Reinforcement learning: An introduction . MIT press . Richard S Sutton and Andrew G Barto. 2018. Reinforcement learning: An introduction .MIT press."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Liang Tang Romer Rosales Ajit Singh and Deepak Agarwal. 2013. Automatic ad format selection via contextual bandits. In CIKM. 1587--1594. Liang Tang Romer Rosales Ajit Singh and Deepak Agarwal. 2013. Automatic ad format selection via contextual bandits. In CIKM. 1587--1594.","DOI":"10.1145\/2505515.2514700"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Zhiwen Tang and Grace Hui Yang. 2017. A Reinforcement Learning Approach for Dynamic Search. In TREC. Zhiwen Tang and Grace Hui Yang. 2017. A Reinforcement Learning Approach for Dynamic Search. In TREC.","DOI":"10.6028\/NIST.SP.500-324.domain-georgetown"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"crossref","unstructured":"Zeng Wei Jun Xu Yanyan Lan Jiafeng Guo and Xueqi Cheng. 2017. Reinforcement learning to rank with Markov decision process. In SIGIR. 945--948. Zeng Wei Jun Xu Yanyan Lan Jiafeng Guo and Xueqi Cheng. 2017. Reinforcement learning to rank with Markov decision process. In SIGIR. 945--948.","DOI":"10.1145\/3077136.3080685"},{"key":"e_1_3_2_2_32_1","volume-title":"A multi-agent reinforcement learning method for impression allocation in online display advertising. arXiv preprint arXiv:1809.03152","author":"Wu Di","year":"2018","unstructured":"Di Wu , Cheng Chen , Xun Yang , Xiujun Chen , Qing Tan , Jian Xu , and Kun Gai . 2018a. A multi-agent reinforcement learning method for impression allocation in online display advertising. arXiv preprint arXiv:1809.03152 ( 2018 ). Di Wu, Cheng Chen, Xun Yang, Xiujun Chen, Qing Tan, Jian Xu, and Kun Gai. 2018a. A multi-agent reinforcement learning method for impression allocation in online display advertising. arXiv preprint arXiv:1809.03152 (2018)."},{"key":"e_1_3_2_2_33_1","unstructured":"Qingyun Wu Naveen Iyer and Hongning Wang. 2018b. Learning contextual bandits in a non-stationary environment. In SIGIR. 495--504. Qingyun Wu Naveen Iyer and Hongning Wang. 2018b. Learning contextual bandits in a non-stationary environment. In SIGIR. 495--504."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Long Xia Jun Xu Yanyan Lan Jiafeng Guo Wei Zeng and Xueqi Cheng. 2017. Adapting Markov decision process for search result diversification. In SIGIR. 535--544. Long Xia Jun Xu Yanyan Lan Jiafeng Guo Wei Zeng and Xueqi Cheng. 2017. Adapting Markov decision process for search result diversification. In SIGIR. 535--544.","DOI":"10.1145\/3077136.3080775"},{"key":"e_1_3_2_2_35_1","unstructured":"Min Xu Tao Qin and Tie-Yan Liu. 2013. Estimation bias in multi-armed bandit algorithms for search advertising. In NIPS. 2400--2408. Min Xu Tao Qin and Tie-Yan Liu. 2013. Estimation bias in multi-armed bandit algorithms for search advertising. In NIPS. 2400--2408."},{"key":"e_1_3_2_2_36_1","volume-title":"Dynamic contextual multi arm bandits in display advertisement","author":"Yang Hongxia","unstructured":"Hongxia Yang and Quan Lu. 2016. Dynamic contextual multi arm bandits in display advertisement . In ICDM. IEEE , 1305--1310. Hongxia Yang and Quan Lu. 2016. Dynamic contextual multi arm bandits in display advertisement. In ICDM. IEEE, 1305--1310."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"crossref","unstructured":"Chunqiu Zeng Qing Wang Shekoofeh Mokhtari and Tao Li. 2016. Online context-aware recommendation with time varying multi-armed bandit. In KDD. 2025--2034. Chunqiu Zeng Qing Wang Shekoofeh Mokhtari and Tao Li. 2016. Online context-aware recommendation with time varying multi-armed bandit. In KDD. 2025--2034.","DOI":"10.1145\/2939672.2939878"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240323.3240374"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Xiangyu Zhao Liang Zhang Zhuoye Ding Long Xia Jiliang Tang and Dawei Yin. 2018b. Recommendations with Negative Feedback via Pairwise Deep Reinforcement Learning. In KDD. ACM 1040--1048. Xiangyu Zhao Liang Zhang Zhuoye Ding Long Xia Jiliang Tang and Dawei Yin. 2018b. Recommendations with Negative Feedback via Pairwise Deep Reinforcement Learning. In KDD. ACM 1040--1048.","DOI":"10.1145\/3219819.3219886"},{"key":"e_1_3_2_2_40_1","volume-title":"Deep Reinforcement Learning for List-wise Recommendations. arXiv preprint arXiv:1801.00209","author":"Zhao Xiangyu","year":"2017","unstructured":"Xiangyu Zhao , Liang Zhang , Zhuoye Ding , Dawei Yin , Yihong Zhao , and Jiliang Tang . 2017. Deep Reinforcement Learning for List-wise Recommendations. arXiv preprint arXiv:1801.00209 ( 2017 ). Xiangyu Zhao, Liang Zhang, Zhuoye Ding, Dawei Yin, Yihong Zhao, and Jiliang Tang. 2017. Deep Reinforcement Learning for List-wise Recommendations. arXiv preprint arXiv:1801.00209 (2017)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"Xiaoxue Zhao Weinan Zhang and Jun Wang. 2013. Interactive collaborative filtering. In CIKM. 1411--1420. Xiaoxue Zhao Weinan Zhang and Jun Wang. 2013. Interactive collaborative filtering. In CIKM. 1411--1420.","DOI":"10.1145\/2505515.2505690"},{"key":"e_1_3_2_2_42_1","volume-title":"Jointly Learning to Recommend and Advertise. arXiv preprint arXiv:2003.00097","author":"Zhao Xiangyu","year":"2020","unstructured":"Xiangyu Zhao , Xudong Zheng , Xiwang Yang , Xiaobing Liu , and Jiliang Tang . 2020. Jointly Learning to Recommend and Advertise. arXiv preprint arXiv:2003.00097 ( 2020 ). Xiangyu Zhao, Xudong Zheng, Xiwang Yang, Xiaobing Liu, and Jiliang Tang. 2020. Jointly Learning to Recommend and Advertise. arXiv preprint arXiv:2003.00097 (2020)."}],"event":{"name":"SIGIR '20: The 43rd International ACM SIGIR conference on research and development in Information Retrieval","location":"Virtual Event China","acronym":"SIGIR '20","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3397271.3401467","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3397271.3401467","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:12:47Z","timestamp":1750201967000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3397271.3401467"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,7,25]]},"references-count":41,"alternative-id":["10.1145\/3397271.3401467","10.1145\/3397271"],"URL":"https:\/\/doi.org\/10.1145\/3397271.3401467","relation":{},"subject":[],"published":{"date-parts":[[2020,7,25]]},"assertion":[{"value":"2020-07-25","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}