{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,21]],"date-time":"2026-07-21T13:27:04Z","timestamp":1784640424288,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730340","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:38:52Z","timestamp":1752457132000},"page":"3606-3615","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["LexRAG: Benchmarking Retrieval-Augmented Generation in Multi-Turn Legal Consultation Conversation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8766-8610","authenticated-orcid":false,"given":"Haitao","family":"Li","sequence":"first","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China and Quan Cheng Laboratory, Shandong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5364-4175","authenticated-orcid":false,"given":"Yifan","family":"Chen","sequence":"additional","affiliation":[{"name":"DCST, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7986-3692","authenticated-orcid":false,"given":"Hu","family":"YiRan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5030-709X","authenticated-orcid":false,"given":"Qingyao","family":"Ai","sequence":"additional","affiliation":[{"name":"Quan Cheng Laboratory, Shandong, China and DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8883-9673","authenticated-orcid":false,"given":"Junjie","family":"Chen","sequence":"additional","affiliation":[{"name":"Quan Cheng Laboratory, Shandong, China and DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1521-1360","authenticated-orcid":false,"given":"Xiaoyu","family":"Yang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3547-0472","authenticated-orcid":false,"given":"Jianhui","family":"Yang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2539-8954","authenticated-orcid":false,"given":"Yueyue","family":"Wu","sequence":"additional","affiliation":[{"name":"Quan Cheng Laboratory, Shandong, China and DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6849-8675","authenticated-orcid":false,"given":"Zeyang","family":"Liu","sequence":"additional","affiliation":[{"name":"Quan Cheng Laboratory, Shandong, China and Shandong University, Shandong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0140-4512","authenticated-orcid":false,"given":"Yiqun","family":"Liu","sequence":"additional","affiliation":[{"name":"Quan Cheng Laboratory, Shandong, China and DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511","author":"Asai Akari","year":"2023","unstructured":"Akari Asai, ZeqiuWu, YizhongWang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511 (2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72."},{"key":"e_1_3_2_1_5_1","volume-title":"Daniel Martin Katz, and Nikolaos Aletras","author":"Chalkidis Ilias","year":"2021","unstructured":"Ilias Chalkidis, Abhik Jana, Dirk Hartung, Michael Bommarito, Ion Androutsopoulos, Daniel Martin Katz, and Nikolaos Aletras. 2021. LexGLUE: A benchmark dataset for legal language understanding in English. arXiv preprint arXiv:2110.00976 (2021)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3166054.3166058"},{"key":"e_1_3_2_1_7_1","volume-title":"Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. arXiv preprint arXiv:2402.03216","author":"Chen Jianlv","year":"2024","unstructured":"Jianlv Chen, Shitao Xiao, Peitian Zhang, Kun Luo, Defu Lian, and Zheng Liu. 2024. Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. arXiv preprint arXiv:2402.03216 (2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"PRE: A Peer Review Based Large Language Model Evaluator. arXiv:2401.15641 [cs.IR] https:\/\/arxiv.org\/abs\/2401.15641","author":"Chu Zhumin","year":"2024","unstructured":"Zhumin Chu, Qingyao Ai, Yiteng Tu, Haitao Li, and Yiqun Liu. 2024. PRE: A Peer Review Based Large Language Model Evaluator. arXiv:2401.15641 [cs.IR] https:\/\/arxiv.org\/abs\/2401.15641"},{"key":"e_1_3_2_1_9_1","volume-title":"ChatLaw: Open-Source Legal Large Language Model with Integrated External Knowledge Bases. arXiv e-prints","author":"Cui Jiaxi","year":"2023","unstructured":"Jiaxi Cui, Zongjian Li, Yang Yan, Bohua Chen, and Li Yuan. 2023. ChatLaw: Open-Source Legal Large Language Model with Integrated External Knowledge Bases. arXiv e-prints (2023), arXiv-2306."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671470"},{"key":"e_1_3_2_1_11_1","volume-title":"Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793","author":"Aohan Zeng Team GLM","year":"2024","unstructured":"Team GLM, Aohan Zeng, Bin Xu, Bowen Wang, Chenhui Zhang, Da Yin, Dan Zhang, Diego Rojas, Guanyu Feng, Hanlin Zhao, et al. 2024. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793 (2024)."},{"key":"e_1_3_2_1_12_1","first-page":"20179","article-title":"A simple language model for task-oriented dialogue","volume":"33","author":"Hosseini-Asl Ehsan","year":"2020","unstructured":"Ehsan Hosseini-Asl, Bryan McCann, Chien-ShengWu, Semih Yavuz, and Richard Socher. 2020. A simple language model for task-oriented dialogue. Advances in Neural Information Processing Systems 33 (2020), 20179-20191.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383123"},{"key":"e_1_3_2_1_14_1","unstructured":"Jiajie Jin Yutao Zhu Xinyu Yang Chenghao Zhang and Zhicheng Dou. 2024. FlashRAG: A Modular Toolkit for Efficient Retrieval-Augmented Generation Research. arXiv:2405.13576 [cs.CL] https:\/\/arxiv.org\/abs\/2405.13576"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_17_1","volume-title":"Large language models in law: A survey. AI Open","author":"Lai Jinqi","year":"2024","unstructured":"Jinqi Lai, Wensheng Gan, Jiayang Wu, Zhenlian Qi, and S Yu Philip. 2024. Large language models in law: A survey. AI Open (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591761"},{"key":"e_1_3_2_1_19_1","volume-title":"BLADE: Enhancing Black-box Large Language Models with Small Domain-Specific Models. arXiv preprint arXiv:2403.18365","author":"Li Haitao","year":"2024","unstructured":"Haitao Li, Qingyao Ai, Jia Chen, Qian Dong, ZhijingWu, Yiqun Liu, Chong Chen, and Qi Tian. 2024. BLADE: Enhancing Black-box Large Language Models with Small Domain-Specific Models. arXiv preprint arXiv:2403.18365 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"Lexilaw: A Scalable Legal Language Model for Comprehensive Legal Understanding. https:\/\/github.com\/CSHaitao\/LexiLaw","author":"Li Haitao","year":"2024","unstructured":"Haitao Li, Qingyao Ai, Qian Dong, and Yiqun Liu. 2024. Lexilaw: A Scalable Legal Language Model for Comprehensive Legal Understanding. https:\/\/github.com\/CSHaitao\/LexiLaw"},{"key":"e_1_3_2_1_21_1","volume-title":"DELTA: Pre-train a Discriminative Encoder for Legal Case Retrieval via Structural Word Alignment. arXiv preprint arXiv:2403.18435","author":"Li Haitao","year":"2024","unstructured":"Haitao Li, Qingyao Ai, Xinyan Han, Jia Chen, Qian Dong, Yiqun Liu, Chong Chen, and Qi Tian. 2024. DELTA: Pre-train a Discriminative Encoder for Legal Case Retrieval via Structural Word Alignment. arXiv preprint arXiv:2403.18435 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Calibraeval: Calibrating prediction distribution to mitigate selection bias in llms-as-judges. arXiv preprint arXiv:2410.15393","author":"Li Haitao","year":"2024","unstructured":"Haitao Li, Junjie Chen, Qingyao Ai, Zhumin Chu, Yujia Zhou, Qian Dong, and Yiqun Liu. 2024. Calibraeval: Calibrating prediction distribution to mitigate selection bias in llms-as-judges. arXiv preprint arXiv:2410.15393 (2024)."},{"key":"e_1_3_2_1_23_1","unstructured":"Haitao Li Junjie Chen Jingli Yang Qingyao Ai Wei Jia Youfeng Liu Kai Lin Yueyue Wu Guozhi Yuan Yiran Hu et al. 2024. LegalAgentBench: Evaluating LLM Agents in Legal Domain. arXiv preprint arXiv:2412.17259 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Lexeval: A comprehensive chinese legal benchmark for evaluating large language models. arXiv preprint arXiv:2409.20288","author":"Li Haitao","year":"2024","unstructured":"Haitao Li, You Chen, Qingyao Ai, Yueyue Wu, Ruizhe Zhang, and Yiqun Liu. 2024. Lexeval: A comprehensive chinese legal benchmark for evaluating large language models. arXiv preprint arXiv:2409.20288 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-97-3076-6_15"},{"key":"e_1_3_2_1_26_1","volume-title":"Llms-as-judges: a comprehensive survey on llm-based evaluation methods. arXiv preprint arXiv:2412.05579","author":"Li Haitao","year":"2024","unstructured":"Haitao Li, Qian Dong, Junjie Chen, Huixue Su, Yujia Zhou, Qingyao Ai, Ziyi Ye, and Yiqun Liu. 2024. Llms-as-judges: a comprehensive survey on llm-based evaluation methods. arXiv preprint arXiv:2412.05579 (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657887"},{"key":"e_1_3_2_1_28_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81."},{"key":"e_1_3_2_1_29_1","volume-title":"Pyserini: An Easy-to-Use Python Toolkit to Support Replicable IR Research with Sparse and Dense Representations. arXiv:2102.10073 [cs.IR] https:\/\/arxiv.org\/abs\/2102.10073","author":"Lin Jimmy","year":"2021","unstructured":"Jimmy Lin, Xueguang Ma, Sheng-Chieh Lin, Jheng-Hong Yang, Ronak Pradeep, and Rodrigo Nogueira. 2021. Pyserini: An Easy-to-Use Python Toolkit to Support Replicable IR Research with Sparse and Dense Representations. arXiv:2102.10073 [cs.IR] https:\/\/arxiv.org\/abs\/2102.10073"},{"key":"e_1_3_2_1_30_1","volume-title":"Recent advances in deep learning based dialogue systems: A systematic survey. Artificial intelligence review 56, 4","author":"Ni Jinjie","year":"2023","unstructured":"Jinjie Ni, Tom Young, Vlad Pandelea, Fuzhao Xue, and Erik Cambria. 2023. Recent advances in deep learning based dialogue systems: A systematic survey. Artificial intelligence review 56, 4 (2023), 3055-3155."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Stephen Robertson Hugo Zaragoza et al. 2009. The probabilistic relevance framework: BM25 and beyond. Foundations and Trends\u00ae in Information Retrieval 3 4 (2009) 333-389.","DOI":"10.1561\/1500000019"},{"key":"e_1_3_2_1_33_1","volume-title":"A comprehensive survey of hallucination mitigation techniques in large language models. arXiv preprint arXiv:2401.01313","author":"Tonmoy SM","year":"2024","unstructured":"SM Tonmoy, SM Zaman, Vinija Jain, Anku Rani, Vipula Rawte, Aman Chadha, and Amitava Das. 2024. A comprehensive survey of hallucination mitigation techniques in large language models. arXiv preprint arXiv:2401.01313 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.210"},{"key":"e_1_3_2_1_36_1","volume-title":"Conference on learning theory. PMLR, 25-54","author":"Li Yuanzhi","year":"2013","unstructured":"YiningWang, LiweiWang, Yuanzhi Li, Di He, and Tie-Yan Liu. 2013. A theoretical analysis of NDCG type ranking measures. In Conference on learning theory. PMLR, 25-54."},{"key":"e_1_3_2_1_37_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_38_1","volume-title":"Advanced statistics: understanding medical record review (MRR) studies","author":"Worster Andrew","year":"2004","unstructured":"Andrew Worster and Ted Haines. 2004. Advanced statistics: understanding medical record review (MRR) studies. Academic emergency medicine 11, 2 (2004), 187-192."},{"key":"e_1_3_2_1_39_1","unstructured":"Chaojun Xiao Haoxi Zhong Zhipeng Guo Cunchao Tu Zhiyuan Liu Maosong Sun Yansong Feng Xianpei Han Zhen Hu Heng Wang et al. 2018. Cail2018: A large-scale legal dataset for judgment prediction. arXiv preprint arXiv:1807.02478 (2018)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591874"},{"key":"e_1_3_2_1_41_1","volume-title":"A Survey on Recent Advances in LLM-Based Multi-turn Dialogue Systems. arXiv preprint arXiv:2402.18013","author":"Yi Zihao","year":"2024","unstructured":"Zihao Yi, Jiarui Ouyang, Yuwen Liu, Tianhao Liao, Zhe Xu, and Ying Shen. 2024. A Survey on Recent Advances in LLM-Based Multi-turn Dialogue Systems. arXiv preprint arXiv:2402.18013 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"Disc-lawllm: Fine-tuning large language models for intelligent legal services. arXiv preprint arXiv:2309.11325","author":"Yue Shengbin","year":"2023","unstructured":"Shengbin Yue, Wei Chen, Siyuan Wang, Bingxuan Li, Chenchen Shen, Shujun Liu, Yuxuan Zhou, Yao Xiao, Song Yun, Xuanjing Huang, et al. 2023. Disc-lawllm: Fine-tuning large language models for intelligent legal services. arXiv preprint arXiv:2309.11325 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Statistical language models for information retrieval. Synthesis lectures on human language technologies 1, 1","author":"Zhai ChengXiang","year":"2008","unstructured":"ChengXiang Zhai. 2008. Statistical language models for information retrieval. Synthesis lectures on human language technologies 1, 1 (2008), 1-141."},{"key":"e_1_3_2_1_44_1","volume-title":"Evaluation ethics of llms in legal domain. arXiv preprint arXiv:2403.11152","author":"Zhang Ruizhe","year":"2024","unstructured":"Ruizhe Zhang, Haitao Li, Yueyue Wu, Qingyao Ai, Yiqun Liu, Min Zhang, and Shaoping Ma. 2024. Evaluation ethics of llms in legal domain. arXiv preprint arXiv:2403.11152 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675","author":"Zhang Tianyi","year":"2019","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q Weinberger, and Yoav Artzi. 2019. Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675 (2019)."},{"key":"e_1_3_2_1_46_1","volume-title":"Kd-Conv: A Chinese multi-domain dialogue dataset towards multi-turn knowledgedriven conversation. arXiv preprint arXiv:2004.04100","author":"Zhou Hao","year":"2020","unstructured":"Hao Zhou, Chujie Zheng, Kaili Huang, Minlie Huang, and Xiaoyan Zhu. 2020. Kd-Conv: A Chinese multi-domain dialogue dataset towards multi-turn knowledgedriven conversation. arXiv preprint arXiv:2004.04100 (2020)."}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy","acronym":"SIGIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730340","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:06:37Z","timestamp":1755864397000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730340"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":46,"alternative-id":["10.1145\/3726302.3730340","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730340","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}