{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:27:16Z","timestamp":1778081236779,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T00:00:00Z","timestamp":1724457600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,25]]},"DOI":"10.1145\/3637528.3671656","type":"proceedings-article","created":{"date-parts":[[2024,8,25]],"date-time":"2024-08-25T04:55:12Z","timestamp":1724561712000},"page":"6083-6094","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Face4Rag: Factual Consistency Evaluation for Retrieval Augmented Generation in Chinese"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-4472-2930","authenticated-orcid":false,"given":"Yunqi","family":"Xu","sequence":"first","affiliation":[{"name":"Ant Group, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1503-6519","authenticated-orcid":false,"given":"Tianchi","family":"Cai","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1083-2834","authenticated-orcid":false,"given":"Jiyan","family":"Jiang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4580-1683","authenticated-orcid":false,"given":"Xierui","family":"Song","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,24]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2019. https:\/\/gaokao.neea.edu.cn\/xhtml1\/report\/19012\/5987-1.htm."},{"key":"e_1_3_2_2_2_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_4_1","unstructured":"Yushi Bai Jiahao Ying Yixin Cao Xin Lv Yuze He Xiaozhi Wang Jifan Yu Kaisheng Zeng Yijia Xiao Haozhe Lyu et al. 2023. Benchmarking Foundation Models with Language-Model-as-an-Examiner. arXiv preprint arXiv:2306.04181 (2023)."},{"key":"e_1_3_2_2_5_1","volume-title":"Baichuan 2: Open Large-scale Language Models. arXiv preprint arXiv:2309.10305","year":"2023","unstructured":"Baichuan. 2023. Baichuan 2: Open Large-scale Language Models. arXiv preprint arXiv:2309.10305 (2023). https:\/\/arxiv.org\/abs\/2309.10305"},{"key":"e_1_3_2_2_6_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-5817"},{"key":"e_1_3_2_2_8_1","volume-title":"Reading wikipedia to answer open-domain questions. arXiv preprint arXiv:1704.00051","author":"Chen Danqi","year":"2017","unstructured":"Danqi Chen, Adam Fisch, Jason Weston, and Antoine Bordes. 2017. Reading wikipedia to answer open-domain questions. arXiv preprint arXiv:1704.00051 (2017)."},{"key":"e_1_3_2_2_9_1","volume-title":"Felm: Benchmarking factuality evaluation of large language models. arXiv preprint arXiv:2310.00741","author":"Chen Shiqi","year":"2023","unstructured":"Shiqi Chen, Yiran Zhao, Jinghan Zhang, I Chern, Siyang Gao, Pengfei Liu, Junxian He, et al. 2023. Felm: Benchmarking factuality evaluation of large language models. arXiv preprint arXiv:2310.00741 (2023)."},{"key":"e_1_3_2_2_10_1","unstructured":"I Chern Steffi Chern Shiqi Chen Weizhe Yuan Kehua Feng Chunting Zhou Junxian He Graham Neubig Pengfei Liu et al. 2023. FacTool: Factuality Detection in Generative AI-A Tool Augmented Framework for Multi-Task and Multi-Domain Scenarios. arXiv preprint arXiv:2307.13528 (2023)."},{"key":"e_1_3_2_2_11_1","volume-title":"Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca. arXiv preprint arXiv:2304.08177","author":"Cui Yiming","year":"2023","unstructured":"Yiming Cui, Ziqing Yang, and Xin Yao. 2023. Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca. arXiv preprint arXiv:2304.08177 (2023). https:\/\/arxiv.org\/abs\/2304.08177"},{"key":"e_1_3_2_2_12_1","volume-title":"Proceedings of acl-08: Hlt. 1039--1047","author":"De Marneffe Marie-Catherine","year":"2008","unstructured":"Marie-Catherine De Marneffe, Anna N Rafferty, and Christopher D Manning. 2008. Finding contradictions in text. In Proceedings of acl-08: Hlt. 1039--1047."},{"key":"e_1_3_2_2_13_1","volume-title":"Ragas: Automated evaluation of retrieval augmented generation. arXiv preprint arXiv:2309.15217","author":"Es Shahul","year":"2023","unstructured":"Shahul Es, Jithin James, Luis Espinosa-Anke, and Steven Schockaert. 2023. Ragas: Automated evaluation of retrieval augmented generation. arXiv preprint arXiv:2309.15217 (2023)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00373"},{"key":"e_1_3_2_2_15_1","volume-title":"Human-like summarization evaluation with chatgpt. arXiv preprint arXiv:2304.02554","author":"Gao Mingqi","year":"2023","unstructured":"Mingqi Gao, Jie Ruan, Renliang Sun, Xunjian Yin, Shiping Yang, and Xiaojun Wan. 2023. Human-like summarization evaluation with chatgpt. arXiv preprint arXiv:2304.02554 (2023)."},{"key":"e_1_3_2_2_16_1","volume-title":"DialFact: A benchmark for fact-checking in dialogue. arXiv preprint arXiv:2110.08222","author":"Gupta Prakhar","year":"2021","unstructured":"Prakhar Gupta, Chien-Sheng Wu, Wenhao Liu, and Caiming Xiong. 2021. DialFact: A benchmark for fact-checking in dialogue. arXiv preprint arXiv:2110.08222 (2021)."},{"key":"e_1_3_2_2_17_1","volume-title":"Evaluating Factual Consistency in Knowledge-Grounded Dialogues via Question Generation and Question Answering. arXiv preprint arXiv:2104.08202","author":"Honovich Or","year":"2021","unstructured":"Or Honovich, Leshem Choshen, Roee Aharoni, Ella Neeman, Idan Szpektor, and Omri Abend. 2021. Q^{2}: Evaluating Factual Consistency in Knowledge-Grounded Dialogues via Question Generation and Question Answering. arXiv preprint arXiv:2104.08202 (2021)."},{"key":"e_1_3_2_2_18_1","unstructured":"Xiangkun Hu Dongyu Ru Qipeng Guo Lin Qiu and Zheng Zhang. 2023. RefChecker for Fine-grained Hallucination Detection. (2023). https:\/\/github.com\/amazon-science\/RefChecker"},{"key":"e_1_3_2_2_19_1","volume-title":"Leveraging passage retrieval with generative models for open domain question answering. arXiv preprint arXiv:2007.01282","author":"Izacard Gautier","year":"2020","unstructured":"Gautier Izacard and Edouard Grave. 2020. Leveraging passage retrieval with generative models for open domain question answering. arXiv preprint arXiv:2007.01282 (2020)."},{"key":"e_1_3_2_2_20_1","volume-title":"Semantic structures","author":"Jackendoff Ray S","unstructured":"Ray S Jackendoff. 1992. Semantic structures. Vol. 18. MIT press."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"key":"e_1_3_2_2_22_1","volume-title":"Logical fallacy detection. arXiv preprint arXiv:2202.13758","author":"Jin Zhijing","year":"2022","unstructured":"Zhijing Jin, Abhinav Lalwani, Tejas Vaidhya, Xiaoyu Shen, Yiwen Ding, Zhiheng Lyu, Mrinmaya Sachan, Rada Mihalcea, and Bernhard Sch\u00f6lkopf. 2022. Logical fallacy detection. arXiv preprint arXiv:2202.13758 (2022)."},{"key":"e_1_3_2_2_23_1","volume-title":"Juan Diego Rodriguez, and Greg Durrett","author":"Kamoi Ryo","year":"2023","unstructured":"Ryo Kamoi, Tanya Goyal, Juan Diego Rodriguez, and Greg Durrett. 2023. Wice: Real-world entailment for claims in wikipedia. arXiv preprint arXiv:2303.01432 (2023)."},{"key":"e_1_3_2_2_24_1","volume-title":"Evaluating the factual consistency of abstractive text summarization. arXiv preprint arXiv:1910.12840","author":"Kry\u015bci\u0144ski Wojciech","year":"2019","unstructured":"Wojciech Kry\u015bci\u0144ski, Bryan McCann, Caiming Xiong, and Richard Socher. 2019. Evaluating the factual consistency of abstractive text summarization. arXiv preprint arXiv:1910.12840 (2019)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00453"},{"key":"e_1_3_2_2_26_1","volume-title":"Fast and Accurate Factual Inconsistency Detection Over Long Documents. arXiv preprint arXiv:2310.13189","author":"Lattimer Barrett Martin","year":"2023","unstructured":"Barrett Martin Lattimer, Patrick Chen, Xinyuan Zhang, and Yi Yang. 2023. Fast and Accurate Factual Inconsistency Detection Over Long Documents. arXiv preprint arXiv:2310.13189 (2023)."},{"key":"e_1_3_2_2_27_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems, Vol. 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2022.03.001"},{"key":"e_1_3_2_2_29_1","volume-title":"Evaluating verifiability in generative search engines. arXiv preprint arXiv:2304.09848","author":"Liu Nelson F","year":"2023","unstructured":"Nelson F Liu, Tianyi Zhang, and Percy Liang. 2023. Evaluating verifiability in generative search engines. arXiv preprint arXiv:2304.09848 (2023)."},{"key":"e_1_3_2_2_30_1","volume-title":"On faithfulness and factuality in abstractive summarization. arXiv preprint arXiv:2005.00661","author":"Maynez Joshua","year":"2020","unstructured":"Joshua Maynez, Shashi Narayan, Bernd Bohnet, and Ryan McDonald. 2020. On faithfulness and factuality in abstractive summarization. arXiv preprint arXiv:2005.00661 (2020)."},{"key":"e_1_3_2_2_31_1","volume-title":"Mohit Iyyer, Luke Zettlemoyer, and Hannaneh Hajishirzi.","author":"Min Sewon","year":"2023","unstructured":"Sewon Min, Kalpesh Krishna, Xinxi Lyu, Mike Lewis, Wen-tau Yih, Pang Wei Koh, Mohit Iyyer, Luke Zettlemoyer, and Hannaneh Hajishirzi. 2023. FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation. arXiv preprint arXiv:2305.14251 (2023)."},{"key":"e_1_3_2_2_32_1","volume-title":"Generating benchmarks for factuality evaluation of language models. arXiv preprint arXiv:2307.06908","author":"Muhlgay Dor","year":"2023","unstructured":"Dor Muhlgay, Ori Ram, Inbal Magar, Yoav Levine, Nir Ratner, Yonatan Belinkov, Omri Abend, Kevin Leyton-Brown, Amnon Shashua, and Yoav Shoham. 2023. Generating benchmarks for factuality evaluation of language models. arXiv preprint arXiv:2307.06908 (2023)."},{"key":"e_1_3_2_2_33_1","unstructured":"OpenAI. 2022. Chatgpt blog post. https:\/\/openai.com\/blog\/chatgpt."},{"key":"e_1_3_2_2_34_1","volume-title":"Understanding factuality in abstractive summarization with FRANK: A benchmark for factuality metrics. arXiv preprint arXiv:2104.13346","author":"Pagnoni Artidoro","year":"2021","unstructured":"Artidoro Pagnoni, Vidhisha Balachandran, and Yulia Tsvetkov. 2021. Understanding factuality in abstractive summarization with FRANK: A benchmark for factuality metrics. arXiv preprint arXiv:2104.13346 (2021)."},{"key":"e_1_3_2_2_35_1","volume-title":"doi","author":"Petric Domina","year":"2020","unstructured":"Domina Petric. 2020. Logical Fallacies. On-line Article (preprint), doi, Vol. 10 (2020)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"Yujia Qin Zihan Cai Dian Jin Lan Yan Shihao Liang Kunlun Zhu Yankai Lin Xu Han Ning Ding Huadong Wang et al. 2023. WebCPM: Interactive Web Search for Chinese Long-form Question Answering. arXiv preprint arXiv:2305.06849 (2023).","DOI":"10.18653\/v1\/2023.acl-long.499"},{"key":"e_1_3_2_2_37_1","volume-title":"Get your vitamin C! robust fact verification with contrastive evidence. arXiv preprint arXiv:2103.08541","author":"Schuster Tal","year":"2021","unstructured":"Tal Schuster, Adam Fisch, and Regina Barzilay. 2021. Get your vitamin C! robust fact verification with contrastive evidence. arXiv preprint arXiv:2103.08541 (2021)."},{"key":"e_1_3_2_2_38_1","volume-title":"Stephen Roller, Megan Ung, Moya Chen, Kushal Arora, Joshua Lane, et al.","author":"Shuster Kurt","year":"2022","unstructured":"Kurt Shuster, Jing Xu, Mojtaba Komeili, Da Ju, Eric Michael Smith, Stephen Roller, Megan Ung, Moya Chen, Kushal Arora, Joshua Lane, et al. 2022. Blenderbot 3: a deployed conversational agent that continually learns to responsibly engage. arXiv preprint arXiv:2208.03188 (2022)."},{"key":"e_1_3_2_2_39_1","volume-title":"Understanding factual errors in summarization: Errors, summarizers, datasets, error detectors. arXiv preprint arXiv:2205.12854","author":"Tang Liyan","year":"2022","unstructured":"Liyan Tang, Tanya Goyal, Alexander R Fabbri, Philippe Laban, Jiacheng Xu, Semih Yavuz, Wojciech Kry\u015bci\u0144ski, Justin F Rousseau, and Greg Durrett. 2022. Understanding factual errors in summarization: Errors, summarizers, datasets, error detectors. arXiv preprint arXiv:2205.12854 (2022)."},{"key":"e_1_3_2_2_40_1","volume-title":"A gold standard methodology for evaluating accuracy in data-to-text systems. arXiv preprint arXiv:2011.03992","author":"Thomson Craig","year":"2020","unstructured":"Craig Thomson and Ehud Reiter. 2020. A gold standard methodology for evaluating accuracy in data-to-text systems. arXiv preprint arXiv:2011.03992 (2020)."},{"key":"e_1_3_2_2_41_1","volume-title":"Jamie Hall, Noam Shazeer, Apoorv Kulshreshtha, Heng-Tze Cheng, Alicia Jin, Taylor Bos, Leslie Baker, Yu Du, et al.","author":"Thoppilan Romal","year":"2022","unstructured":"Romal Thoppilan, Daniel De Freitas, Jamie Hall, Noam Shazeer, Apoorv Kulshreshtha, Heng-Tze Cheng, Alicia Jin, Taylor Bos, Leslie Baker, Yu Du, et al. 2022. Lamda: Language models for dialog applications. arXiv preprint arXiv:2201.08239 (2022)."},{"key":"e_1_3_2_2_42_1","volume-title":"Evaluating open question answering evaluation. arXiv preprint arXiv:2305.12421","author":"Wang Cunxiang","year":"2023","unstructured":"Cunxiang Wang, Sirui Cheng, Zhikun Xu, Bowen Ding, Yidong Wang, and Yue Zhang. 2023. Evaluating open question answering evaluation. arXiv preprint arXiv:2305.12421 (2023)."},{"key":"e_1_3_2_2_43_1","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume":"35","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 24824--24837.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_44_1","unstructured":"Aohan Zeng Xiao Liu Zhengxiao Du Zihan Wang Hanyu Lai Ming Ding Zhuoyi Yang Yifan Xu Wendi Zheng Xiao Xia et al. 2022. Glm-130b: An open bilingual pre-trained model. arXiv preprint arXiv:2210.02414 (2022)."},{"key":"e_1_3_2_2_45_1","volume-title":"AlignScore: Evaluating Factual Consistency with a Unified Alignment Function. arXiv preprint arXiv:2305.16739","author":"Zha Yuheng","year":"2023","unstructured":"Yuheng Zha, Yichi Yang, Ruichen Li, and Zhiting Hu. 2023. AlignScore: Evaluating Factual Consistency with a Unified Alignment Function. arXiv preprint arXiv:2305.16739 (2023)."},{"key":"e_1_3_2_2_46_1","volume-title":"Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675","author":"Zhang Tianyi","year":"2019","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q Weinberger, and Yoav Artzi. 2019. Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675 (2019)."},{"key":"e_1_3_2_2_47_1","volume-title":"Annotating and Detecting Fine-grained Factual Errors for Dialogue Summarization. arXiv preprint arXiv:2305.16548","author":"Zhu Rongxin","year":"2023","unstructured":"Rongxin Zhu, Jianzhong Qi, and Jey Han Lau. 2023. Annotating and Detecting Fine-grained Factual Errors for Dialogue Summarization. arXiv preprint arXiv:2305.16548 (2023)."}],"event":{"name":"KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Barcelona Spain","acronym":"KDD '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671656","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3637528.3671656","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:06:00Z","timestamp":1750291560000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671656"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,24]]},"references-count":46,"alternative-id":["10.1145\/3637528.3671656","10.1145\/3637528"],"URL":"https:\/\/doi.org\/10.1145\/3637528.3671656","relation":{},"subject":[],"published":{"date-parts":[[2024,8,24]]},"assertion":[{"value":"2024-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}