{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,18]],"date-time":"2026-05-18T11:28:46Z","timestamp":1779103726916,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758307","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"13435-13441","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["RealFactBench: A Benchmark for Evaluating Large Language Models in Real-World Fact-Checking"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1638-9623","authenticated-orcid":false,"given":"Shuo","family":"Yang","sequence":"first","affiliation":[{"name":"The University of Hong Kong, Hong Kong SAR, China and Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3460-6844","authenticated-orcid":false,"given":"Yuqin","family":"Dai","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China and Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8371-0285","authenticated-orcid":false,"given":"Guoqing","family":"Wang","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1130-7916","authenticated-orcid":false,"given":"Xinran","family":"Zheng","sequence":"additional","affiliation":[{"name":"University College London, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7876-3740","authenticated-orcid":false,"given":"Jinfeng","family":"Xu","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6749-5442","authenticated-orcid":false,"given":"Jinze","family":"Li","sequence":"additional","affiliation":[{"name":"The University of HongKong, Hong Kong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9624-3776","authenticated-orcid":false,"given":"Zhenzhe","family":"Ying","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6159-619X","authenticated-orcid":false,"given":"Weiqiang","family":"Wang","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3454-8731","authenticated-orcid":false,"given":"Edith C. H.","family":"Ngai","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Altman, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Altman, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Moonshot AI. [n.d.]. Moonshot-V1. https:\/\/platform.moonshot.ai\/. Accessed: 2025-05-10."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i12.26648"},{"key":"e_1_3_2_1_4_1","unstructured":"Anthropic. 2025. Claude 3.7 Sonnet and Claude Code. https:\/\/www.anthropic.com\/news\/claude-3-7-sonnet\/. Accessed: 2025-05-10."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00486"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-024-00881-z"},{"key":"e_1_3_2_1_7_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"FactBench: A Dynamic Benchmark for In-the-Wild Language Model Factuality Evaluation. arXiv preprint arXiv:2410.22257","author":"Bayat Farima Fatahi","year":"2024","unstructured":"Farima Fatahi Bayat, Lechen Zhang, Sheza Munir, and Lu Wang. 2024. FactBench: A Dynamic Benchmark for In-the-Wild Language Model Factuality Evaluation. arXiv preprint arXiv:2410.22257 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"LRQ-Fact: LLM-Generated Relevant Questions for Multimodal Fact-Checking. arXiv preprint arXiv:2410.04616","author":"Beigi Alimohammad","year":"2024","unstructured":"Alimohammad Beigi, Bohan Jiang, Dawei Li, Tharindu Kumarage, Zhen Tan, Pouya Shaeri, and Huan Liu. 2024. LRQ-Fact: LLM-Generated Relevant Questions for Multimodal Fact-Checking. arXiv preprint arXiv:2410.04616 (2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"I Chern Steffi Chern Shiqi Chen Weizhe Yuan Kehua Feng Chunting Zhou Junxian He Graham Neubig Pengfei Liu et al. 2023. FacTool: Factuality Detection in Generative AI-A Tool Augmented Framework for Multi-Task and Multi-Domain Scenarios. arXiv preprint arXiv:2307.13528 (2023)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589335.3651504"},{"key":"e_1_3_2_1_12_1","volume-title":"Careful Queries","author":"Dai Yuqin","year":"2025","unstructured":"Yuqin Dai, Shuo Yang, Guoqing Wang, Yong Deng, and Others. 2025. Careful Queries, Credible Results: Teaching RAG Models Advanced Web Search Tools with Reinforcement Learning. arXiv preprint arXiv:2508.07956 (2025)."},{"key":"e_1_3_2_1_13_1","unstructured":"Google DeepMind. 2025. Gemini 2.5 Flash Preview: Model Card. Technical Report. Google. Accessed: 2025-05-10."},{"key":"e_1_3_2_1_14_1","volume-title":"A survey on the optimization of large language model-based agents. arXiv preprint arXiv:2503.12434","author":"Du Shangheng","year":"2025","unstructured":"Shangheng Du, Jiabao Zhao, Jinxin Shi, Zhentao Xie, Xin Jiang, Yanhong Bai, and Liang He. 2025. A survey on the optimization of large language model-based agents. arXiv preprint arXiv:2503.12434 (2025)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00454"},{"key":"e_1_3_2_1_16_1","volume-title":"Language models as knowledge bases: On entity representations, storage capacity, and paraphrased queries. arXiv preprint arXiv:2008.09036","author":"Heinzerling Benjamin","year":"2020","unstructured":"Benjamin Heinzerling and Kentaro Inui. 2020. Language models as knowledge bases: On entity representations, storage capacity, and paraphrased queries. arXiv preprint arXiv:2008.09036 (2020)."},{"key":"e_1_3_2_1_17_1","volume-title":"Do large language models know about facts? arXiv preprint arXiv:2310.05177","author":"Hu Xuming","year":"2023","unstructured":"Xuming Hu, Junzhe Chen, Xiaochuan Li, Yufei Guo, Lijie Wen, Philip S Yu, and Zhijiang Guo. 2023. Do large language models know about facts? arXiv preprint arXiv:2310.05177 (2023)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11156-023-01239-z"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i28.35354"},{"key":"e_1_3_2_1_20_1","volume-title":"Jian-Yun Nie, and Ji-Rong Wen.","author":"Li Junyi","year":"2023","unstructured":"Junyi Li, Xiaoxue Cheng, Wayne Xin Zhao, Jian-Yun Nie, and Ji-Rong Wen. 2023. Halueval: A large-scale hallucination evaluation benchmark for large language models. arXiv preprint arXiv:2305.11747 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Large language model agent for fake news detection. arXiv preprint arXiv:2405.01593","author":"Li Xinyi","year":"2024","unstructured":"Xinyi Li, Yongfeng Zhang, and Edward C Malthouse. 2024a. Large language model agent for fake news detection. arXiv preprint arXiv:2405.01593 (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.3233\/FAIA240787"},{"key":"e_1_3_2_1_23_1","volume-title":"FACT-AUDIT: An Adaptive Multi-Agent Framework for Dynamic Fact-Checking Evaluation of Large Language Models. arXiv preprint arXiv:2502.17924","author":"Lin Hongzhan","year":"2025","unstructured":"Hongzhan Lin, Yang Deng, Yuxuan Gu, Wenxuan Zhang, Jing Ma, See-Kiong Ng, and Tat-Seng Chua. 2025. FACT-AUDIT: An Adaptive Multi-Agent Framework for Dynamic Fact-Checking Evaluation of Large Language Models. arXiv preprint arXiv:2502.17924 (2025)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.229"},{"key":"e_1_3_2_1_25_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024a. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Mmfakebench: A mixed-source multimodal misinformation detection benchmark for lvlms. arXiv preprint arXiv:2406.08772","author":"Liu Xuannan","year":"2024","unstructured":"Xuannan Liu, Zekun Li, Peipei Li, Huaibo Huang, Shuhan Xia, Xing Cui, Linzhi Huang, Weihong Deng, and Zhaofeng He. 2024b. Mmfakebench: A mixed-source multimodal misinformation detection benchmark for lvlms. arXiv preprint arXiv:2406.08772 (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714748"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2019.112986"},{"key":"e_1_3_2_1_29_1","unstructured":"Meta. 2024. Introducing Llama 3.1: Our most capable models to date. https:\/\/ai.meta.com\/blog\/meta-llama-3-1\/. Accessed: 2025-05-10."},{"key":"e_1_3_2_1_30_1","volume-title":"Mohit Iyyer, Luke Zettlemoyer, and Hannaneh Hajishirzi.","author":"Min Sewon","year":"2023","unstructured":"Sewon Min, Kalpesh Krishna, Xinxi Lyu, Mike Lewis, Wen-tau Yih, Pang Wei Koh, Mohit Iyyer, Luke Zettlemoyer, and Hannaneh Hajishirzi. 2023. Factscore: Fine-grained atomic evaluation of factual precision in long form text generation. arXiv preprint arXiv:2305.14251 (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.5812\/ijhrba-142986"},{"key":"e_1_3_2_1_32_1","volume-title":"William Yang Wang, Min-Yen Kan, and Preslav Nakov.","author":"Pan Liangming","year":"2023","unstructured":"Liangming Pan, Xiaobao Wu, Xinyuan Lu, Anh Tuan Luu, William Yang Wang, Min-Yen Kan, and Preslav Nakov. 2023. Fact-checking complex claims with program-guided reasoning. arXiv preprint arXiv:2305.12744 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.165"},{"key":"e_1_3_2_1_34_1","volume-title":"Exploring the deceptive power of llm-generated fake news: A study of real-world detection challenges. arXiv preprint arXiv:2403.18249","author":"Sun Yanshen","year":"2024","unstructured":"Yanshen Sun, Jianfeng He, Limeng Cui, Shuo Lei, and Chang-Tien Lu. 2024. Exploring the deceptive power of llm-generated fake news: A study of real-world detection challenges. arXiv preprint arXiv:2403.18249 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"FEVER: a large-scale dataset for fact extraction and VERification. arXiv preprint arXiv:1803.05355","author":"Thorne James","year":"2018","unstructured":"James Thorne, Andreas Vlachos, Christos Christodoulopoulos, and Arpit Mittal. 2018. FEVER: a large-scale dataset for fact extraction and VERification. arXiv preprint arXiv:1803.05355 (2018)."},{"key":"e_1_3_2_1_36_1","volume-title":"Generative large language models in automated fact-checking: A survey. arXiv preprint arXiv:2407.02351","author":"Vykopal Ivan","year":"2024","unstructured":"Ivan Vykopal, Mat\u00fa\u0161 Pikuliak, Simon Ostermann, and Mari\u00e1n \u0160imko. 2024. Generative large language models in automated fact-checking: A survey. arXiv preprint arXiv:2407.02351 (2024)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.609"},{"key":"e_1_3_2_1_38_1","volume-title":"MFC-Bench: Benchmarking Multimodal Fact-Checking with Large Vision-Language Models. arXiv preprint arXiv:2406.11288","author":"Wang Shengkang","year":"2024","unstructured":"Shengkang Wang, Hongzhan Lin, Ziyang Luo, Zhen Ye, Guang Chen, and Jing Ma. 2024a. MFC-Bench: Benchmarking Multimodal Fact-Checking with Large Vision-Language Models. arXiv preprint arXiv:2406.11288 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Zain Mujahid, Arnav Arora, Aleksandr Rubashevskii, Jiahui Geng, Osama Mohammed Afzal, Liangming Pan, Nadav Borenstein, Aditya Pillai, et al.","author":"Wang Yuxia","year":"2024","unstructured":"Yuxia Wang, Revanth Gangi Reddy, Zain Mujahid, Arnav Arora, Aleksandr Rubashevskii, Jiahui Geng, Osama Mohammed Afzal, Liangming Pan, Nadav Borenstein, Aditya Pillai, et al., 2024b. Factcheck-Bench: Fine-Grained Evaluation Benchmark for Automatic Fact-checkers. In Findings of the Association for Computational Linguistics: EMNLP 2024. 14199-14230."},{"key":"e_1_3_2_1_40_1","first-page":"80756","article-title":"Long-form factuality in large language models","volume":"37","author":"Wei Jerry","year":"2024","unstructured":"Jerry Wei, Chengrun Yang, Xinying Song, Yifeng Lu, Nathan Hu, Jie Huang, Dustin Tran, Daiyi Peng, et al., 2024. Long-form factuality in large language models. Advances in Neural Information Processing Systems, Vol. 37, 80756-80827.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","volume-title":"Ngai","author":"Yang Shuo","year":"2025","unstructured":"Shuo Yang, Yuqin Dai, Guoqing Wang, Xinran Zheng, Jinfeng Xu, Jinze Li, Zhenzhe Ying, Weiqiang Wang, and Edith C. H. Ngai. 2025a. RealFactBench: A Benchmark for Evaluating Large Language Models in Real-World Fact-Checking. arXiv preprint arXiv:2506.12538 (2025)."},{"key":"e_1_3_2_1_42_1","volume-title":"Ngai","author":"Yang Shuo","year":"2025","unstructured":"Shuo Yang, Zijian Yu, Zhenzhe Ying, Yuqin Dai, Guoqing Wang, Jun Lan, Jinfeng Xu, Jinze Li, and Edith C. H. Ngai. 2025b. RAMA: Retrieval-Augmented Multi-Agent Framework for Misinformation Detection in Multimodal Fact-Checking. arXiv preprint arXiv:2507.09174 (2025)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.551"},{"key":"e_1_3_2_1_44_1","volume-title":"TrendFact: A Benchmark for Explainable Hotspot Perception in Fact-Checking with Natural Language Explanation. arXiv preprint arXiv:2410.15135","author":"Zhang Xiaocheng","year":"2024","unstructured":"Xiaocheng Zhang, Xi Wang, Yifei Lu, Jianing Wang, Zhuangzhuang Ye, Mengjiao Bao, Peng Yan, and Xiaohong Su. 2024. TrendFact: A Benchmark for Explainable Hotspot Perception in Fact-Checking with Natural Language Explanation. arXiv preprint arXiv:2410.15135 (2024)."},{"key":"e_1_3_2_1_45_1","first-page":"44502","article-title":"Felm: Benchmarking factuality evaluation of large language models","volume":"36","author":"Zhao Yiran","year":"2023","unstructured":"Yiran Zhao, Jinghan Zhang, I Chern, Siyang Gao, Pengfei Liu, Junxian He, et al., 2023. Felm: Benchmarking factuality evaluation of large language models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 44502-44523.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","first-page":"46595","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","volume":"36","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al., 2023. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems, Vol. 36, 46595-46623.","journal-title":"Advances in Neural Information Processing Systems"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758307","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:35Z","timestamp":1765309535000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758307"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":46,"alternative-id":["10.1145\/3746027.3758307","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758307","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}