{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T21:22:00Z","timestamp":1775078520486,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730275","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:21:38Z","timestamp":1752456098000},"page":"3712-3722","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Researchy Questions: A Dataset of Multi-Perspective, Decompositional Questions for Deep Research"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9167-6214","authenticated-orcid":false,"given":"Corbin","family":"Rosset","sequence":"first","affiliation":[{"name":"Microsoft, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2490-4712","authenticated-orcid":false,"given":"Ho-Lam","family":"Chung","sequence":"additional","affiliation":[{"name":"National Taiwan University, Taipei, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3009-8614","authenticated-orcid":false,"given":"Guanghui","family":"Qin","sequence":"additional","affiliation":[{"name":"Johns Hopkins University, Baltimore, MD, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8851-6701","authenticated-orcid":false,"given":"Ethan","family":"Chau","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3061-0551","authenticated-orcid":false,"given":"Zhuo","family":"Feng","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6426-3537","authenticated-orcid":false,"given":"Ahmed","family":"Awadallah","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1157-018X","authenticated-orcid":false,"given":"Jennifer","family":"Neville","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0281-932X","authenticated-orcid":false,"given":"Nikhil","family":"Rao","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, WA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Reinald Kim Amplayo KellieWebster Michael Collins Dipanjan Das and Shashi Narayan. 2022. Query Refinement Prompts for Closed-Book Long-Form Question Answering. arXiv:2210.17525 [cs.CL]","DOI":"10.18653\/v1\/2023.acl-long.444"},{"key":"e_1_3_2_1_2_1","volume-title":"Complex and Exploratory Web Search. (01","author":"Aula Anne","year":"2008","unstructured":"Anne Aula and Daniel Russell. 2008. Complex and Exploratory Web Search. (01 2008)."},{"key":"e_1_3_2_1_3_1","volume-title":"MS MARCO: A Human Generated MAchine Reading COmprehension Dataset. arXiv:1611.09268 [cs.CL]","author":"Bajaj Payal","year":"2018","unstructured":"Payal Bajaj, Daniel Campos, Nick Craswell, Li Deng, Jianfeng Gao, Xiaodong Liu, Rangan Majumder, Andrew McNamara, Bhaskar Mitra, Tri Nguyen, Mir Rosenberg, Xia Song, Alina Stoica, Saurabh Tiwary, and Tong Wang. 2018. MS MARCO: A Human Generated MAchine Reading COmprehension Dataset. arXiv:1611.09268 [cs.CL]"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D13-1160"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531926"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.290"},{"key":"e_1_3_2_1_7_1","unstructured":"Sebastian Borgeaud Arthur Mensch Jordan Hoffmann Trevor Cai Eliza Rutherford Katie Millican George van den Driessche Jean-Baptiste Lespiau Bogdan Damoc Aidan Clark Diego de Las Casas Aurelia Guy Jacob Menick Roman Ring Tom Hennigan Saffron Huang Loren Maggiore Chris Jones Albin Cassirer Andy Brock Michela Paganini Geoffrey Irving Oriol Vinyals Simon Osindero Karen Simonyan Jack W. Rae Erich Elsen and Laurent Sifre. 2022. Improving language models by retrieving from trillions of tokens. arXiv:2112.04426 [cs.CL]"},{"key":"e_1_3_2_1_8_1","first-page":"1119 10","volume-title":"Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing, Hang Li and Llu\u00eds M\u00e0rquez (Eds.). Association for Computational Linguistics","author":"Bu Fan","year":"2010","unstructured":"Fan Bu, Xingwei Zhu, Yu Hao, and Xiaoyan Zhu. 2010. Function-Based Question Classification for General QA. In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing, Hang Li and Llu\u00eds M\u00e0rquez (Eds.). Association for Computational Linguistics, Cambridge, MA, 1119-1128. https:\/\/aclanthology.org\/D10-1109"},{"key":"e_1_3_2_1_9_1","volume-title":"MS MARCO: A Human Generated MAchine Reading COmprehension Dataset. ArXiv abs\/1611.09268","author":"Campos Daniel Fernando","year":"2016","unstructured":"Daniel Fernando Campos, Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, Rangan Majumder, Li Deng, and Bhaskar Mitra. 2016. MS MARCO: A Human Generated MAchine Reading COmprehension Dataset. ArXiv abs\/1611.09268 (2016). https:\/\/api.semanticscholar.org\/CorpusID:1289517"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.91"},{"key":"e_1_3_2_1_11_1","unstructured":"Xiang Deng Yu Gu Boyuan Zheng Shijie Chen Samuel Stevens Boshi Wang Huan Sun and Yu Su. 2023. Mind2Web: Towards a Generalist Agent for the Web. arXiv:2306.06070 [cs.CL]"},{"key":"e_1_3_2_1_12_1","unstructured":"Elvis Dohmatob Yunzhen Feng Pu Yang Francois Charton and Julia Kempe. 2024. A Tale of Tails: Model Collapse as a Change of Scaling Laws. arXiv:2402.07043 [cs.LG]"},{"key":"e_1_3_2_1_13_1","unstructured":"Matthew Dunn Levent Sagun Mike Higgins V. Ugur Guney Volkan Cirik and Kyunghyun Cho. 2017. SearchQA: A New Q&A Dataset Augmented with Context from a Search Engine. arXiv:1704.05179 [cs.CL]"},{"key":"e_1_3_2_1_14_1","unstructured":"Brian Everitt. 1974. Cluster analysis. Heinemann Educational [for] the Social Science Research Council."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p19-1346"},{"key":"e_1_3_2_1_16_1","unstructured":"Adam Fourney Gagan Bansal Hussein Mozannar Cheng Tan Eduardo Salinas Erkang Zhu Friederike Niedtner Grace Proebsting Griffin Bassman Jack Gerrits Jacob Alber Peter Chang Ricky Loynd RobertWest Victor Dibia Ahmed Awadallah Ece Kamar Rafah Hosn and Saleema Amershi. 2024. Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks. arXiv:2411.04468 [cs.AI] https:\/\/arxiv.org\/abs\/2411.04468"},{"key":"e_1_3_2_1_17_1","volume-title":"The Pile: An 800GB Dataset of Diverse Text for Language Modeling. arXiv:2101.00027 [cs.CL]","author":"Gao Leo","year":"2020","unstructured":"Leo Gao, Stella Biderman, Sid Black, Laurence Golding, Travis Hoppe, Charles Foster, Jason Phang, Horace He, Anish Thite, Noa Nabeshima, Shawn Presser, and Connor Leahy. 2020. The Pile: An 800GB Dataset of Diverse Text for Language Modeling. arXiv:2101.00027 [cs.CL]"},{"key":"e_1_3_2_1_18_1","volume-title":"Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust.","author":"Gur Izzeddin","year":"2023","unstructured":"Izzeddin Gur, Hiroki Furuta, Austin Huang, Mustafa Safdari, Yutaka Matsuo, Douglas Eck, and Aleksandra Faust. 2023. A Real-WorldWebAgent with Planning, Long Context Understanding, and Program Synthesis. arXiv:2307.12856 [cs.LG]"},{"key":"e_1_3_2_1_19_1","volume-title":"REALM: Retrieval-Augmented Language Model Pre-Training. arXiv:2002.08909 [cs.CL]","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Ming-Wei Chang. 2020. REALM: Retrieval-Augmented Language Model Pre-Training. arXiv:2002.08909 [cs.CL]"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acllong"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2556195.2556221"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2661829.2661912"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.74"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_25_1","volume-title":"Article arXiv:1705.03551","author":"Joshi Mandar","year":"2017","unstructured":"Mandar Joshi, Eunsol Choi, Daniel Weld, and Luke Zettlemoyer. 2017. triviaqa: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension. arXiv e-prints, Article arXiv:1705.03551 (2017), arXiv:1705.03551 pages. arXiv:1705.03551"},{"key":"e_1_3_2_1_26_1","unstructured":"Daniel Kahneman. 2011. Thinking Fast and Slow. Farrar Straus and Giroux New York."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Vladimir Karpukhin Barlas Oguz Sewon Min Patrick Lewis Ledell Wu Sergey Edunov Danqi Chen and Wen tau Yih. 2020. Dense Passage Retrieval for Open- Domain Question Answering. arXiv:2004.04906 [cs.CL]","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2808194.2809465"},{"key":"e_1_3_2_1_29_1","volume-title":"David Hall, Percy Liang, Christopher Potts, and Matei Zaharia.","author":"Khattab Omar","year":"2023","unstructured":"Omar Khattab, Keshav Santhanam, Xiang Lisa Li, David Hall, Percy Liang, Christopher Potts, and Matei Zaharia. 2023. Demonstrate-Search-Predict: Composing retrieval and language models for knowledge-intensive NLP. arXiv:2212.14024 [cs.CL]"},{"key":"e_1_3_2_1_30_1","volume-title":"Decomposed Prompting: A Modular Approach for Solving Complex Tasks. arXiv:2210.02406 [cs.CL]","author":"Khot Tushar","year":"2023","unstructured":"Tushar Khot, Harsh Trivedi, Matthew Finlayson, Yao Fu, Kyle Richardson, Peter Clark, and Ashish Sabharwal. 2023. Decomposed Prompting: A Modular Approach for Solving Complex Tasks. arXiv:2210.02406 [cs.CL]"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Kalpesh Krishna Aurko Roy and Mohit Iyyer. 2021. Hurdles to Progress in Long-form Question Answering. arXiv:2103.06332 [cs.CL]","DOI":"10.18653\/v1\/2021.naacl-main.393"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Sayali Kulkarni Sheide Chammas Wan Zhu Fei Sha and Eugene Ie. 2020. AQua-MuSe: Automatically Generating Datasets for Query-Based Multi-Document Summarization. arXiv:2010.12694 [cs.CL]","DOI":"10.1007\/978-3-030-86331-9_6"},{"key":"e_1_3_2_1_33_1","volume-title":"Natural Questions: a Benchmark for Question Answering Research. Transactions of the Association of Computational Linguistics","author":"Kwiatkowski Tom","year":"2019","unstructured":"Tom Kwiatkowski, Jennimaria Palomaki, Olivia Redfield, Michael Collins, Ankur Parikh, Chris Alberti, Danielle Epstein, Illia Polosukhin, Matthew Kelcey, Jacob Devlin, Kenton Lee, Kristina N. Toutanova, Llion Jones, Ming-Wei Chang, Andrew Dai, Jakob Uszkoreit, Quoc Le, and Slav Petrov. 2019. Natural Questions: a Benchmark for Question Answering Research. Transactions of the Association of Computational Linguistics (2019)."},{"key":"e_1_3_2_1_34_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2021","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2021. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. arXiv:2005.11401 [cs.CL]"},{"key":"e_1_3_2_1_35_1","volume-title":"Hashimoto","author":"Li Xuechen","year":"2023","unstructured":"Xuechen Li, Tianyi Zhang, Yann Dubois, Rohan Taori, Ishaan Gulrajani, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. AlpacaEval: An Automatic Evaluator of Instruction-following Models. https:\/\/github.com\/tatsulab\/alpaca_eval."},{"key":"e_1_3_2_1_36_1","unstructured":"Nelson F. Liu Kevin Lin John Hewitt Ashwin Paranjape Michele Bevilacqua Fabio Petroni and Percy Liang. 2023. Lost in the Middle: How Language Models Use Long Contexts. arXiv:2307.03172 [cs.CL]"},{"key":"e_1_3_2_1_37_1","unstructured":"Nelson F. Liu Tianyi Zhang and Percy Liang. 2023. Evaluating Verifiability in Generative Search Engines. arXiv:2304.09848 [cs.CL]"},{"key":"e_1_3_2_1_38_1","unstructured":"Xiao Liu Hao Yu Hanchen Zhang Yifan Xu Xuanyu Lei Hanyu Lai Yu Gu Hangliang Ding Kaiwen Men Kejuan Yang Shudan Zhang Xiang Deng Aohan Zeng Zhengxiao Du Chenhui Zhang Sheng Shen Tianjun Zhang Yu Su Huan Sun Minlie Huang Yuxiao Dong and Jie Tang. 2023. AgentBench: Evaluating LLMs as Agents. arXiv:2308.03688 [cs.AI]"},{"key":"e_1_3_2_1_39_1","unstructured":"Gr\u00e9goire Mialon Cl\u00e9mentine Fourrier Craig Swift Thomas Wolf Yann LeCun and Thomas Scialom. 2023. GAIA: a benchmark for General AI Assistants. arXiv:2311.12983 [cs.CL]"},{"key":"e_1_3_2_1_40_1","unstructured":"Reiichiro Nakano Jacob Hilton Suchir Balaji Jeff Wu Long Ouyang Christina Kim Christopher Hesse Shantanu Jain Vineet Kosaraju William Saunders Xu Jiang Karl Cobbe Tyna Eloundou Gretchen Krueger Kevin Button Matthew Knight Benjamin Chess and John Schulman. 2022. WebGPT: Browser-assisted question-answering with human feedback. arXiv:2112.09332 [cs.CL]"},{"key":"e_1_3_2_1_41_1","unstructured":"OpenAI Josh Achiam Steven Adler and Sandhini Agarwal et al. 2023. GPT-4 Technical Report. arXiv:2303.08774 [cs.CL]"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Arnold Overwijk Chenyan Xiong Xiao Liu Cameron VandenBerg and Jamie Callan. 2022. ClueWeb22: 10 Billion Web Documents with Visual and Semantic Information. arXiv:2211.15848 [cs.IR]","DOI":"10.1145\/3477495.3536321"},{"key":"e_1_3_2_1_43_1","volume-title":"Instruction Tuning with GPT-4. arXiv preprint arXiv:2304.03277","author":"Peng Baolin","year":"2023","unstructured":"Baolin Peng, Chunyuan Li, Pengcheng He, Michel Galley, and Jianfeng Gao. 2023. Instruction Tuning with GPT-4. arXiv preprint arXiv:2304.03277 (2023)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Long Phan Alice Gatti and Ziwen Han et al. 2025. Humanity's Last Exam. arXiv:2501.14249 [cs.LG] https:\/\/arxiv.org\/abs\/2501.14249","DOI":"10.70777\/si.v2i1.13973"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Ofir Press Muru Zhang Sewon Min Ludwig Schmidt Noah A. Smith and Mike Lewis. 2023. Measuring and Narrowing the Compositionality Gap in Language Models. arXiv:2210.03350 [cs.CL]","DOI":"10.18653\/v1\/2023.findings-emnlp.378"},{"key":"e_1_3_2_1_46_1","volume-title":"Tamera Lanham, Tim Maxwell, Venkatesa Chandrasekaran, Zac Hatfield-Dodds, Jared Kaplan, Jan Brauner, Samuel R. Bowman, and Ethan Perez.","author":"Radhakrishnan Ansh","year":"2023","unstructured":"Ansh Radhakrishnan, Karina Nguyen, Anna Chen, Carol Chen, Carson Denison, Danny Hernandez, Esin Durmus, Evan Hubinger, Jackson Kernion, Kamil\u0117 Luko\u0161i\u016bt\u0117, Newton Cheng, Nicholas Joseph, Nicholas Schiefer, Oliver Rausch, Sam McCandlish, Sheer El Showk, Tamera Lanham, Tim Maxwell, Venkatesa Chandrasekaran, Zac Hatfield-Dodds, Jared Kaplan, Jan Brauner, Samuel R. Bowman, and Ethan Perez. 2023. Question Decomposition Improves the Faithfulness of Model-Generated Reasoning. arXiv:2307.11768 [cs.CL]"},{"key":"e_1_3_2_1_47_1","volume-title":"Liu","author":"Raffel Colin","year":"2023","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2023. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv:1910.10683 [cs.LG]"},{"key":"e_1_3_2_1_48_1","volume-title":"Qiaoqiao She, Hua Wu, Haifeng Wang, and Ji-Rong Wen.","author":"Ren Ruiyang","year":"2023","unstructured":"Ruiyang Ren, Yingqi Qu, Jing Liu, Wayne Xin Zhao, Qiaoqiao She, Hua Wu, Haifeng Wang, and Ji-Rong Wen. 2023. RocketQAv2: A Joint Training Method for Dense Passage Retrieval and Passage Re-ranking. arXiv:2110.07367 [cs.CL]"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Zhihong Shao Yeyun Gong Yelong Shen Minlie Huang Nan Duan and Weizhu Chen. 2023. Enhancing Retrieval-Augmented Large Language Models with Iterative Retrieval-Generation Synergy. arXiv:2305.15294 [cs.CL]","DOI":"10.18653\/v1\/2023.findings-emnlp.620"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1098\/rsfs.2015.0039"},{"key":"e_1_3_2_1_51_1","volume-title":"The Black Swan","author":"Taleb Nassim Nicholas","unstructured":"Nassim Nicholas Taleb. 2008. The Black Swan. Penguin Books, Harlow, England."},{"key":"e_1_3_2_1_52_1","volume-title":"MuSiQue: Multihop Questions via Single-hop Question Composition. Transactions of the Association for Computational Linguistics","author":"Trivedi Harsh","year":"2022","unstructured":"Harsh Trivedi, Niranjan Balasubramanian, Tushar Khot, and Ashish Sabharwal. 2022. MuSiQue: Multihop Questions via Single-hop Question Composition. Transactions of the Association for Computational Linguistics (2022)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Harsh Trivedi Niranjan Balasubramanian Tushar Khot and Ashish Sabharwal. 2023. Interleaving Retrieval with Chain-of-Thought Reasoning for Knowledge- Intensive Multi-Step Questions. arXiv:2212.10509 [cs.CL]","DOI":"10.18653\/v1\/2023.acl-long.557"},{"key":"e_1_3_2_1_54_1","volume-title":"Subcommittee on Space Science Technology, and Applications","author":"House Committee on Science United States Congress","year":"1981","unstructured":"House Committee on Science United States Congress, Subcommittee on Space Science Technology, and Applications. 1981. NASA Program Management and Procurement Procedures and Practices: Hearings Before the Subcommittee on Space Science and Applications of the Committee on Science and Technology, U.S. House of Representatives, Ninety-seventh Congress, First Session. U.S. Government Printing Office, Washington, D.C."},{"key":"e_1_3_2_1_55_1","volume-title":"Chi, Quoc Le, and Denny Zhou","author":"Wei Jason","year":"2023","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, Ed Chi, Quoc Le, and Denny Zhou. 2023. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. arXiv:2201.11903 [cs.CL]"},{"key":"e_1_3_2_1_56_1","unstructured":"Jiayang Wu Wensheng Gan Zefeng Chen Shicheng Wan and Hong Lin. 2023. AI-Generated Content (AIGC): A Survey. arXiv:2304.06632 [cs.AI]"},{"key":"e_1_3_2_1_57_1","volume-title":"Ryen W White, Doug Burger, and Chi Wang.","author":"Wu Qingyun","year":"2023","unstructured":"Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Beibin Li, Erkang Zhu, Li Jiang, Xiaoyun Zhang, Shaokun Zhang, Jiale Liu, Ahmed Hassan Awadallah, Ryen W White, Doug Burger, and Chi Wang. 2023. AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation. arXiv:2308.08155 [cs.AI]"},{"key":"e_1_3_2_1_58_1","unstructured":"Lee Xiong Chenyan Xiong Ye Li Kwok-Fung Tang Jialin Liu Paul Bennett Junaid Ahmed and Arnold Overwijk. 2020. Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval. arXiv:2007.00808 [cs.IR]"},{"key":"e_1_3_2_1_59_1","volume-title":"Manning","author":"Yang Zhilin","year":"2018","unstructured":"Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, WilliamW. Cohen, Ruslan Salakhutdinov, and Christopher D. Manning. 2018. HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering. arXiv:1809.09600 [cs.CL]"},{"key":"e_1_3_2_1_60_1","unstructured":"Shunyu Yao Dian Yu Jeffrey Zhao Izhak Shafran Thomas L. Griffiths Yuan Cao and Karthik Narasimhan. 2023. Tree of Thoughts: Deliberate Problem Solving with Large Language Models. arXiv:2305.10601 [cs.CL]"},{"key":"e_1_3_2_1_61_1","unstructured":"Shunyu Yao Jeffrey Zhao Dian Yu Nan Du Izhak Shafran Karthik Narasimhan and Yuan Cao. 2023. ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629 [cs.CL]"},{"key":"e_1_3_2_1_62_1","volume-title":"Kyunghyun Cho, Xian Li, Sainbayar Sukhbaatar, Jing Xu, and Jason Weston.","author":"Yuan Weizhe","year":"2024","unstructured":"Weizhe Yuan, Richard Yuanzhe Pang, Kyunghyun Cho, Xian Li, Sainbayar Sukhbaatar, Jing Xu, and Jason Weston. 2024. Self-Rewarding Language Models. arXiv:2401.10020 [cs.CL]"},{"key":"e_1_3_2_1_63_1","volume-title":"Beam Retrieval: General End-to-End Retrieval for Multi-Hop Question Answering. arXiv:2308.08973 [cs.CL]","author":"Zhang Jiahao","year":"2023","unstructured":"Jiahao Zhang, Haiyang Zhang, Dongmei Zhang, Yong Liu, and Shen Huang. 2023. Beam Retrieval: General End-to-End Retrieval for Multi-Hop Question Answering. arXiv:2308.08973 [cs.CL]"},{"key":"e_1_3_2_1_64_1","unstructured":"Xiang Zhang Junbo Zhao and Yann LeCun. 2016. Character-level Convolutional Networks for Text Classification. arXiv:1509.01626 [cs.LG]"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.217"},{"key":"e_1_3_2_1_66_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric P. Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-Judge with MTBench and Chatbot Arena. arXiv:2306.05685 [cs.CL]"},{"key":"e_1_3_2_1_67_1","unstructured":"Shen Zheng Jie Huang and Kevin Chen-Chuan Chang. 2023. Why Does ChatGPT Fall Short in Providing Truthful Answers? arXiv:2304.10513 [cs.CL]"},{"key":"e_1_3_2_1_68_1","volume-title":"Xu Chen, Yankai Lin, Ji-Rong Wen, and Jiawei Han.","author":"Zhou Kun","year":"2023","unstructured":"Kun Zhou, Yutao Zhu, Zhipeng Chen, Wentong Chen, Wayne Xin Zhao, Xu Chen, Yankai Lin, Ji-Rong Wen, and Jiawei Han. 2023. Don't Make Your LLM an Evaluation Benchmark Cheater. arXiv:2311.01964 [cs.CL]"}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy","acronym":"SIGIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730275","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T10:04:15Z","timestamp":1755857055000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730275"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":68,"alternative-id":["10.1145\/3726302.3730275","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730275","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}