{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,17]],"date-time":"2026-06-17T21:26:39Z","timestamp":1781731599637,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":141,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2421839"],"award-info":[{"award-number":["2421839"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3736569","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T20:52:41Z","timestamp":1754254361000},"page":"6107-6117","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":27,"title":["Uncertainty Quantification and Confidence Calibration in Large Language Models: A Survey"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-1082-8326","authenticated-orcid":false,"given":"Xiaoou","family":"Liu","sequence":"first","affiliation":[{"name":"Arizona State University, Tempe, AZ, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1013-0924","authenticated-orcid":false,"given":"Tiejin","family":"Chen","sequence":"additional","affiliation":[{"name":"Arizona State University, Tempe, AZ, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8631-9634","authenticated-orcid":false,"given":"Longchao","family":"Da","sequence":"additional","affiliation":[{"name":"Arizona State University, Tempe, AZ, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6101-2150","authenticated-orcid":false,"given":"Chacha","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8673-6868","authenticated-orcid":false,"given":"Zhen","family":"Lin","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Champaign, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3735-1635","authenticated-orcid":false,"given":"Hua","family":"Wei","sequence":"additional","affiliation":[{"name":"Arizona State University, Tempe, AZ, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774(2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Amos Azaria and Tom Mitchell. 2023. The Internal State of an LLM Knows When It`s Lying. (2023) 967-976.","DOI":"10.18653\/v1\/2023.findings-emnlp.68"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.124"},{"key":"e_1_3_2_1_4_1","unstructured":"Oleksandr Balabanov and Hampus Linander. 2024. Uncertainty quantification in fine-tuned LLMs using LoRA ensembles. arXiv preprint arXiv:2402.12264(2024)."},{"key":"e_1_3_2_1_5_1","unstructured":"Evan Becker and Stefano Soatto. 2024. Cycles of thought: Measuring llm confidence through stable explanations. arXiv preprint arXiv:2406.03441(2024)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.283"},{"key":"e_1_3_2_1_7_1","volume-title":"ICLR 2025 Workshop on Navigating and Addressing Data Problems for Foundation Models.","author":"Chen Tiejin","year":"2025","unstructured":"Tiejin Chen, Kuan-Ru Liou, Mithun Shivakoti, Aaryan Gaur, Pragya Kumari, Meiqi Guo, and Hua Wei. 2025 a. Abg-SciQA: A dataset for Understanding and Resolving Ambiguity in Scientific Questions. In ICLR 2025 Workshop on Navigating and Addressing Data Problems for Foundation Models."},{"key":"e_1_3_2_1_8_1","unstructured":"Tiejin Chen Xiaoou Liu Longchao Da Jia Chen Vagelis Papalexakis and Hua Wei. 2025 b. Uncertainty Quantification of Large Language Models through Multi-Dimensional Responses. arXiv preprint arXiv:2502.16820(2025)."},{"key":"e_1_3_2_1_9_1","unstructured":"Zizhang Chen Peizhao Li Xiaomeng Dong and Pengyu Hong. 2024. Uncertainty Quantification for Clinical Outcome Predictions with (Large) Language Models. arXiv preprint arXiv:2411.03497(2024)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3630106.3659048"},{"key":"e_1_3_2_1_11_1","first-page":"114812","article-title":"Large language model validity via enhanced conformal prediction methods","volume":"37","author":"Cherian John","year":"2025","unstructured":"John Cherian, Isaac Gibbs, and Emmanuel Candes. 2025. Large language model validity via enhanced conformal prediction methods. Advances in Neural Information Processing Systems, Vol. 37 (2025), 114812-114842.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","unstructured":"Karl Cobbe Vineet Kosaraju Mohammad Bavarian Mark Chen Heewoo Jun Lukasz Kaiser Matthias Plappert Jerry Tworek Jacob Hilton Reiichiro Nakano Christopher Hesse and John Schulman. 2021. Training Verifiers to Solve Math Word Problems. arXiv preprint arXiv:2110.14168(2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.35"},{"key":"e_1_3_2_1_14_1","volume-title":"Selection-Inference: Exploiting Large Language Models for Interpretable Logical Reasoning. In The Eleventh International Conference on Learning Representations.","author":"Creswell Antonia","unstructured":"Antonia Creswell, Murray Shanahan, and Irina Higgins. [n.d.]. Selection-Inference: Exploiting Large Language Models for Interpretable Logical Reasoning. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_15_1","unstructured":"Longchao Da Tiejin Chen Lu Cheng and Hua Wei. 2024a. Llm uncertainty quantification through directional entailment graph and claim level response augmentation. arXiv preprint arXiv:2407.00994(2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i1.27758"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13042-024-02190-8"},{"key":"e_1_3_2_1_18_1","unstructured":"Longchao Da Xiaoou Liu Jiaxin Dai Lu Cheng Yaqing Wang and Hua Wei. 2025. Understanding the Uncertainty of LLM Explanations: A Perspective Based on Reasoning Topology. arXiv preprint arXiv:2502.17026(2025)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CDC49753.2023.10383645"},{"key":"e_1_3_2_1_20_1","unstructured":"Longchao Da Rui Wang Xiaojian Xu Parminder Bhatia Taha Kass-Hout Hua Wei and Cao Xiao. 2024d. Segment as You Wish-Free-Form Language-Based Segmentation for Medical Images. arXiv preprint arXiv:2410.12831(2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.3390\/s23229225"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3592009"},{"key":"e_1_3_2_1_23_1","volume-title":"Aleatory or epistemic? Does it matter? Structural safety","author":"Kiureghian Armen Der","year":"2009","unstructured":"Armen Der Kiureghian and Ove Ditlevsen. 2009. Aleatory or epistemic? Does it matter? Structural safety, Vol. 31, 2 (2009), 105-112."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.276"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00410"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.558"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.41"},{"key":"e_1_3_2_1_28_1","volume-title":"international conference on machine learning. PMLR, 1050-1059","author":"Gal Yarin","year":"2016","unstructured":"Yarin Gal and Zoubin Ghahramani. 2016. Dropout as a bayesian approximation: Representing model uncertainty in deep learning. In international conference on machine learning. PMLR, 1050-1059."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.eacl-long.143"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Mor Geva Daniel Khashabi Elad Segal Tushar Khot Dan Roth and Jonathan Berant. 2021. Did Aristotle Use a Laptop? A Question Answering Benchmark with Implicit Reasoning Strategies. Transactions of the Association for Computational Linguistics(2021) 346-361.","DOI":"10.1162\/tacl_a_00370"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.887"},{"key":"e_1_3_2_1_32_1","volume-title":"ROSCOE: A Suite of Metrics for Scoring Step-by-Step Reasoning. In The Eleventh International Conference on Learning Representations.","author":"Golovneva Olga","unstructured":"Olga Golovneva, Moya Peng Chen, Spencer Poff, Martin Corredor, Luke Zettlemoyer, Maryam Fazel-Zarandi, and Asli Celikyilmaz. [n.d.]. ROSCOE: A Suite of Metrics for Scoring Step-by-Step Reasoning. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_33_1","volume-title":"3rd Conference on Automated Knowledge Base Construction(2021)","author":"Guo Meiqi","year":"2021","unstructured":"Meiqi Guo, Mingda Zhang, Siva Reddy, and Malihe Alikhani. 2021. Abg-coqa: Clarifying ambiguity in conversational question answering. 3rd Conference on Automated Knowledge Base Construction(2021)."},{"key":"e_1_3_2_1_34_1","unstructured":"Dan Hendrycks Collin Burns Steven Basart Andy Zou Mantas Mazeika Dawn Song and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300(2020)."},{"key":"e_1_3_2_1_35_1","volume-title":"Saehoon Kim, Juho Lee, Kwang Joon Kim, Eunho Yang, and Sung Ju Hwang.","author":"Heo Jay","year":"2018","unstructured":"Jay Heo, Hae Beom Lee, Saehoon Kim, Juho Lee, Kwang Joon Kim, Eunho Yang, and Sung Ju Hwang. 2018. Uncertainty-aware attention for reliable interpretation and prediction. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_36_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Hou Bairu","year":"2024","unstructured":"Bairu Hou, Yujian Liu, Kaizhi Qian, Jacob Andreas, Shiyu Chang, and Yang Zhang. 2024. Decomposing Uncertainty for Large Language Models through Input Clarification Ensembling. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_37_1","unstructured":"Hsiu-Yuan Huang Yutong Yang Zhaoxi Zhang Sanwoo Lee and Yunfang Wu. 2024b. A survey of uncertainty estimation in llms: Theory meets practice. arXiv preprint arXiv:2410.15326(2024)."},{"key":"e_1_3_2_1_38_1","unstructured":"Jingwang Huang Jiang Zhong Qin Lei Jinpeng Gao Yuming Yang Sirui Wang Peiguang Li and Kaiwen Wei. 2025 b. Latent Distribution Decoupling: A Probabilistic Framework for Uncertainty-Aware Multimodal Emotion Recognition. arXiv preprint arXiv:2502.13954(2025)."},{"key":"e_1_3_2_1_39_1","volume-title":"Chandra Bhagavatula, and Yejin Choi.","author":"Huang Lifu","year":"2019","unstructured":"Lifu Huang, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. 2019. Cosmos QA: Machine reading comprehension with contextual commonsense reasoning. arXiv preprint arXiv:1909.00277(2019)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3703155"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.785"},{"key":"e_1_3_2_1_42_1","volume-title":"Aleatoric and epistemic uncertainty in machine learning: An introduction to concepts and methods. Machine learning","author":"H\u00fcllermeier Eyke","year":"2021","unstructured":"Eyke H\u00fcllermeier and Willem Waegeman. 2021. Aleatoric and epistemic uncertainty in machine learning: An introduction to concepts and methods. Machine learning, Vol. 110, 3 (2021), 457-506."},{"key":"e_1_3_2_1_43_1","unstructured":"Fushuo Huo Wenchao Xu Zhong Zhang Haozhao Wang Zhicheng Chen and Peilin Zhao. 2024. Self-introspective decoding: Alleviating hallucinations for large vision-language models. arXiv preprint arXiv:2408.02032(2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.188"},{"key":"e_1_3_2_1_45_1","volume-title":"Guan","author":"Jiang Heinrich","year":"2018","unstructured":"Heinrich Jiang, Been Kim, Maya Gupta, and Melody Y. Guan. 2018. To trust or not to trust a classifier. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Zhengbao Jiang Jun Araki Haibo Ding and Graham Neubig. 2021. How Can We Know When Language Models Know? On the Calibration of Language Models for Question Answering. Transactions of the Association for Computational Linguistics(2021) 962-977.","DOI":"10.1162\/tacl_a_00407"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.3390\/app11146421"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1147"},{"key":"e_1_3_2_1_49_1","unstructured":"Saurav Kadavath Tom Conerly Amanda Askell Tom Henighan Dawn Drain Ethan Perez Nicholas Schiefer Zac Hatfield-Dodds Nova DasSarma Eli Tran-Johnson et al. 2022. Language models (mostly) know what they know. arXiv preprint arXiv:2207.05221(2022)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.uncertainlp-1.1"},{"key":"e_1_3_2_1_51_1","volume-title":"Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation. In The Eleventh International Conference on Learning Representations.","author":"Kuhn Lorenz","year":"2023","unstructured":"Lorenz Kuhn, Yarin Gal, and Sebastian Farquhar. 2023. Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_52_1","unstructured":"Aviral Kumar and Sunita Sarawagi. 2019. Calibration of encoder decoder models for neural machine translation. arXiv preprint arXiv:1903.00802(2019)."},{"key":"e_1_3_2_1_53_1","unstructured":"Bhawesh Kumar Charlie Lu Gauri Gupta Anil Palepu David Bellamy Ramesh Raskar and Andrew Beam. 2023. Conformal prediction with large language models for multi-choice question answering. arXiv preprint arXiv:2305.18404(2023)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1082"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3690624.3709379"},{"key":"e_1_3_2_1_56_1","volume-title":"Simple and scalable predictive uncertainty estimation using deep ensembles. Advances in neural information processing systems","author":"Lakshminarayanan Balaji","year":"2017","unstructured":"Balaji Lakshminarayanan, Alexander Pritzel, and Charles Blundell. 2017. Simple and scalable predictive uncertainty estimation using deep ensembles. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Baolin Li Yankai Jiang Vijay Gadepally and Devesh Tiwari. 2024a. Llm inference serving: Survey of recent advances and opportunities. arXiv preprint arXiv:2407.12391(2024).","DOI":"10.1109\/HPEC62836.2024.10938426"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.397"},{"key":"e_1_3_2_1_59_1","volume-title":"Bolin Shen, et al.","author":"Li Lincan","year":"2024","unstructured":"Lincan Li, Jiaqi Li, Catherine Chen, Fred Gui, Hongjia Yang, Chenxiao Yu, Zhengguang Wang, Jianing Cai, Junlong Aaron Zhou, Bolin Shen, et al., 2024b. Political-llm: Large language models in political science. arXiv preprint arXiv:2412.06864(2024)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00268"},{"key":"e_1_3_2_1_61_1","unstructured":"Zixuan Li Jing Xiong Fanghua Ye Chuanyang Zheng Xun Wu Jianqiao Lu Zhongwei Wan Xiaodan Liang Chengming Li Zhenan Sun et al. 2024c. UncertaintyRAG: Span-Level Uncertainty Enhanced Long-Context Modeling for Retrieval-Augmented Generation. arXiv preprint arXiv:2410.02719(2024)."},{"key":"e_1_3_2_1_62_1","first-page":"71998","article-title":"Introspective Planning: Aligning Robots' Uncertainty with Inherent Task Ambiguity","volume":"37","author":"Liang Kaiqu","year":"2024","unstructured":"Kaiqu Liang, Zixu Zhang, and Jaime Fisac. 2024b. Introspective Planning: Aligning Robots' Uncertainty with Inherent Task Ambiguity. Advances in Neural Information Processing Systems, Vol. 37 (2024), 71998-72031.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Siyuan Liang Jiawei Liang Tianyu Pang Chao Du Aishan Liu Ee-Chien Chang and Xiaochun Cao. 2024a. Revisiting backdoor attacks against large vision-language models. arXiv preprint arXiv:2406.18844(2024).","DOI":"10.1109\/CVPR52734.2025.00885"},{"key":"e_1_3_2_1_64_1","unstructured":"Stephanie Lin Jacob Hilton and Owain Evans. 2022a. Teaching models to express their uncertainty in words. arXiv preprint arXiv:2205.14334(2022)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.229"},{"key":"e_1_3_2_1_66_1","unstructured":"Zhen Lin Shubhendu Trivedi and Jimeng Sun. 2023. Generating with Confidence: Uncertainty Quantification for Black-box Large Language Models. Transactions on Machine Learning Research(2023)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.578"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.184"},{"key":"e_1_3_2_1_69_1","unstructured":"Linyu Liu Yu Pan Xiaocheng Li and Guanting Chen. 2024c. Uncertainty estimation and quantification for llms: A simple supervised approach. arXiv preprint arXiv:2404.15993(2024)."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1205"},{"key":"e_1_3_2_1_71_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Liu Xin","year":"2024","unstructured":"Xin Liu, Muhammad Khalifa, and Lu Wang. 2024a. LitCab: Lightweight Language Model Calibration over Short- and Long-form Responses. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_72_1","unstructured":"Xiaoou Liu Zhen Lin Longchao Da Chacha Chen Shubhendu Trivedi and Hua Wei. 2025. MCQA-Eval: Efficient Confidence Evaluation in NLG with Gold-Standard Correctness Labels. arXiv preprint arXiv:2502.14268(2025)."},{"key":"e_1_3_2_1_73_1","unstructured":"Jinliang Lu Ziliang Pang Min Xiao Yaochen Zhu Rui Xia and Jiajun Zhang. 2024. Merge ensemble and cooperate! a survey on collaborative strategies in the era of large language models. arXiv preprint arXiv:2407.06089(2024)."},{"key":"e_1_3_2_1_74_1","first-page":"46534","article-title":"Self-refine: Iterative refinement with self-feedback","volume":"36","author":"Madaan Aman","year":"2023","unstructured":"Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et al., 2023. Self-refine: Iterative refinement with self-feedback. Advances in Neural Information Processing Systems, Vol. 36 (2023), 46534-46594.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_75_1","volume-title":"Uncertainty Estimation in Autoregressive Structured Prediction. In International Conference on Learning Representations.","author":"Malinin Andrey","year":"2021","unstructured":"Andrey Malinin and Mark Gales. 2021. Uncertainty Estimation in Autoregressive Structured Prediction. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_76_1","volume-title":"SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models. In 2023 Conference on Empirical Methods in Natural Language Processing.","author":"Manakul Potsawee","unstructured":"Potsawee Manakul, Adian Liusie, and Mark Gales. [n.d.]. SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models. In 2023 Conference on Empirical Methods in Natural Language Processing."},{"key":"e_1_3_2_1_77_1","first-page":"5011","article-title":"Active Learning Principles for In-Context Learning with Large Language Models","author":"Margatina Katerina","year":"2023","unstructured":"Katerina Margatina, Timo Schick, Nikolaos Aletras, and Jane Dwivedi-Yu. 2023. Active Learning Principles for In-Context Learning with Large Language Models. In Findings of the Association for Computational Linguistics. 5011-5034.","journal-title":"Findings of the Association for Computational Linguistics."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.741"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"crossref","unstructured":"Sewon Min Julian Michael Hannaneh Hajishirzi and Luke Zettlemoyer. 2020. AmbigQA: Answering Ambiguous Open-domain Questions. In EMNLP.","DOI":"10.18653\/v1\/2020.emnlp-main.466"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448355"},{"key":"e_1_3_2_1_81_1","volume-title":"Beyond Accuracy: Evaluating the Reasoning Behavior of Large Language Models-A Survey. In First Conference on Language Modeling.","author":"Mondorf Philipp","year":"2024","unstructured":"Philipp Mondorf and Barbara Plank. 2024. Beyond Accuracy: Evaluating the Reasoning Behavior of Large Language Models-A Survey. In First Conference on Language Modeling."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-industry.31"},{"key":"e_1_3_2_1_83_1","first-page":"21199","article-title":"Uncertainty-aware self-training for few-shot text classification","volume":"33","author":"Mukherjee Subhabrata","year":"2020","unstructured":"Subhabrata Mukherjee and Ahmed Awadallah. 2020. Uncertainty-aware self-training for few-shot text classification. Advances in Neural Information Processing Systems, Vol. 33 (2020), 21199-21212.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_84_1","unstructured":"James F Mullen Jr and Dinesh Manocha. 2024. LAP Using Action Feasibility for Improved Uncertainty Alignment of Large Language Model Planners. arXiv preprint arXiv:2403.13198(2024)."},{"key":"e_1_3_2_1_85_1","first-page":"41","article-title":"Reliability of Subjective Probability Forecasts of Precipitation and Temperature","volume":"26","author":"Murphy Allan H.","year":"1977","unstructured":"Allan H. Murphy and Robert L. Winkler. 1977. Reliability of Subjective Probability Forecasts of Precipitation and Temperature. Journal of The Royal Statistical Society Series C-applied Statistics, Vol. 26 (1977), 41-47.","journal-title":"Journal of The Royal Statistical Society Series C-applied Statistics"},{"key":"e_1_3_2_1_86_1","first-page":"65","article-title":"Accuracy-rejection curves (ARCs) for comparing classification methods with a reject option","author":"Ahmed Nadeem Malik Sajjad","year":"2009","unstructured":"Malik Sajjad Ahmed Nadeem, Jean-Daniel Zucker, and Blaise Hanczar. 2009. Accuracy-rejection curves (ARCs) for comparing classification methods with a reject option. In Machine Learning in Systems Biology. 65-81.","journal-title":"Machine Learning in Systems Biology."},{"key":"e_1_3_2_1_87_1","first-page":"8901","article-title":"Kernel language entropy: Fine-grained uncertainty quantification for LLMs from semantic similarities","volume":"37","author":"Nikitin Alexander","year":"2024","unstructured":"Alexander Nikitin, Jannik Kossen, Yarin Gal, and Pekka Marttinen. 2024. Kernel language entropy: Fine-grained uncertainty quantification for LLMs from semantic similarities. Advances in Neural Information Processing Systems, Vol. 37 (2024), 8901-8929.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.1109\/UR61395.2024.10597499"},{"key":"e_1_3_2_1_89_1","first-page":"68772","article-title":"Llm evaluators recognize and favor their own generations","volume":"37","author":"Panickssery Arjun","year":"2024","unstructured":"Arjun Panickssery, Samuel Bowman, and Shi Feng. 2024. Llm evaluators recognize and favor their own generations. Advances in Neural Information Processing Systems, Vol. 37 (2024), 68772-68802.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_90_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-024-00944-1"},{"key":"e_1_3_2_1_92_1","volume-title":"Conformal Language Modeling. In The Twelfth International Conference on Learning Representations.","author":"Quach Victor","unstructured":"Victor Quach, Adam Fisch, Tal Schuster, Adam Yala, Jae Ho Sohn, Tommi S Jaakkola, and Regina Barzilay. [n.d.]. Conformal Language Modeling. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"crossref","unstructured":"Pranav Rajpurkar Jian Zhang Konstantin Lopyrev and Percy Liang. 2016. Squad: 100 000 questions for machine comprehension of text. arXiv preprint arXiv:1606.05250(2016).","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00266"},{"key":"e_1_3_2_1_95_1","volume-title":"First Conference on Language Modeling.","author":"Rein David","year":"2024","unstructured":"David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R Bowman. 2024. Gpqa: A graduate-level google-proof q&a benchmark. In First Conference on Language Modeling."},{"key":"e_1_3_2_1_96_1","volume-title":"Out-of-Distribution Detection and Selective Generation for Conditional Language Models. In The Eleventh International Conference on Learning Representations.","author":"Ren Jie","year":"2023","unstructured":"Jie Ren, Jiaming Luo, Yao Zhao, Kundan Krishna, Mohammad Saleh, Balaji Lakshminarayanan, and Peter J Liu. 2023a. Out-of-Distribution Detection and Selective Generation for Conditional Language Models. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_97_1","first-page":"49","volume-title":"Proceedings on ''I Can't Believe It's Not Better: Failure Modes in the Age of Foundation Models'' at NeurIPS 2023 Workshops","volume":"239","author":"Ren Jie","year":"2023","unstructured":"Jie Ren, Yao Zhao, Tu Vu, Peter J. Liu, and Balaji Lakshminarayanan. 2023b. Self-Evaluation Improves Selective Generation in Large Language Models. In Proceedings on ''I Can't Believe It's Not Better: Failure Modes in the Age of Foundation Models'' at NeurIPS 2023 Workshops, Vol. 239. 49-64."},{"key":"e_1_3_2_1_98_1","volume-title":"Ali Soroush, and Jonathan H Chen.","author":"Savage Thomas","year":"2024","unstructured":"Thomas Savage, John Wang, Robert Gallo, Abdessalem Boukil, Vishwesh Patel, Seyed Amir Ahmad Safavi-Naini, Ali Soroush, and Jonathan H Chen. 2024. Large language model uncertainty measurement and calibration for medical diagnosis and treatment. medRxiv(2024), 2024-06."},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-024-05796-1"},{"key":"e_1_3_2_1_100_1","volume-title":"Evidential deep learning to quantify classification uncertainty. Advances in neural information processing systems","author":"Sensoy Murat","year":"2018","unstructured":"Murat Sensoy, Lance Kaplan, and Melih Kandemir. 2018. Evidential deep learning to quantify classification uncertainty. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_101_1","article-title":"A tutorial on conformal prediction","volume":"9","author":"Shafer Glenn","year":"2008","unstructured":"Glenn Shafer and Vladimir Vovk. 2008. A tutorial on conformal prediction. Journal of Machine Learning Research, Vol. 9, 3 (2008).","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_102_1","unstructured":"Ola Shorinwa Zhiting Mei Justin Lidard Allen Z Ren and Anirudha Majumdar. 2024. A survey on uncertainty quantification of large language models: Taxonomy open research challenges and future directions. arXiv preprint arXiv:2412.05563(2024)."},{"key":"e_1_3_2_1_103_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00598"},{"key":"e_1_3_2_1_104_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.54"},{"key":"e_1_3_2_1_105_1","volume-title":"Peng Wang, and Yanning Zhang.","author":"Suo Wei","year":"2025","unstructured":"Wei Suo, Lijun Zhang, Mengyang Sun, Lin Yuanbo Wu, Peng Wang, and Yanning Zhang. 2025. Octopus: Alleviating Hallucination via Dynamic Contrastive Decoding. arXiv preprint arXiv:2503.00361(2025)."},{"key":"e_1_3_2_1_106_1","doi-asserted-by":"crossref","unstructured":"James Thorne Andreas Vlachos Christos Christodoulopoulos and Arpit Mittal. 2018. FEVER: a large-scale dataset for fact extraction and VERification. arXiv preprint arXiv:1803.05355(2018).","DOI":"10.18653\/v1\/N18-1074"},{"key":"e_1_3_2_1_107_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.330"},{"key":"e_1_3_2_1_108_1","unstructured":"Yao-Hung Hubert Tsai Walter Talbott and Jian Zhang. 2024. Efficient Non-Parametric Uncertainty Quantification for Black-Box Large Language Models and Decision Planning. arXiv preprint arXiv:2402.00251(2024)."},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.824"},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"crossref","unstructured":"Roman Vashurin Ekaterina Fadeeva Artem Vazhentsev Lyudmila Rvanova Daniil Vasilev Akim Tsvigun Sergey Petrakov Rui Xing Abdelrahman Sadallah Kirill Grishchenkov Alexander Panchenko Timothy Baldwin Preslav Nakov Maxim Panov and Artem Shelmanov. 2025. Benchmarking Uncertainty Quantification Methods for Large Language Models with LM-Polygraph. Transactions of the Association for Computational Linguistics(2025) 220-248.","DOI":"10.1162\/tacl_a_00737"},{"key":"e_1_3_2_1_111_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.652"},{"key":"e_1_3_2_1_113_1","first-page":"95238","article-title":"Grokking of implicit reasoning in transformers: A mechanistic journey to the edge of generalization","volume":"37","author":"Wang Boshi","year":"2024","unstructured":"Boshi Wang, Xiang Yue, Yu Su, and Huan Sun. 2024b. Grokking of implicit reasoning in transformers: A mechanistic journey to the edge of generalization. Advances in Neural Information Processing Systems, Vol. 37 (2024), 95238-95265.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_114_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.278"},{"key":"e_1_3_2_1_115_1","first-page":"67758","article-title":"Blob: Bayesian low-rank adaptation by backpropagation for large language models","volume":"37","author":"Wang Yibin","year":"2025","unstructured":"Yibin Wang, Haizhou Shi, Ligong Han, Dimitris Metaxas, and Hao Wang. 2025. Blob: Bayesian low-rank adaptation by backpropagation for large language models. Advances in Neural Information Processing Systems, Vol. 37 (2025), 67758-67794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_116_1","volume-title":"Heng Tao Shen, and Xiaofeng Zhu","author":"Wang Zhiyuan","year":"2024","unstructured":"Zhiyuan Wang, Jinhao Duan, Lu Cheng, Yue Zhang, Qingni Wang, Xiaoshuang Shi, Kaidi Xu, Heng Tao Shen, and Xiaofeng Zhu. 2024a. ConU: Conformal Uncertainty in Large Language Models with Correctness Coverage Guarantees. In Findings of the Association for Computational Linguistics. 6886-6898."},{"key":"e_1_3_2_1_117_1","unstructured":"Jiaxin Wu Yizhou Yu and Hong-Yu Zhou. 2024. Uncertainty Estimation of Large Language Models in Medical Question Answering. arXiv preprint arXiv:2407.08662(2024)."},{"key":"e_1_3_2_1_118_1","doi-asserted-by":"crossref","unstructured":"Shuo Xing Yuping Wang Peiran Li Ruizheng Bai Yueqi Wang Chan-wei Hu Chengxuan Qian Huaxiu Yao and Zhengzhong Tu. 2025. Re-Align: Aligning Vision Language Models via Retrieval-Augmented Direct Preference Optimization. arXiv preprint arXiv:2502.13146(2025).","DOI":"10.18653\/v1\/2025.emnlp-main.121"},{"key":"e_1_3_2_1_119_1","volume-title":"An Empirical Evaluation of Confidence Elicitation in LLMs. In The Twelfth International Conference on Learning Representations.","author":"Xiong Miao","year":"2024","unstructured":"Miao Xiong, Zhiyuan Hu, Xinyang Lu, YIFEI LI, Jie Fu, Junxian He, and Bryan Hooi. 2024a. Can LLMs Express Their Uncertainty? An Empirical Evaluation of Confidence Elicitation in LLMs. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_120_1","volume-title":"Neurips Safe Generative AI Workshop","author":"Xiong Miao","year":"2024","unstructured":"Miao Xiong, Andrea Santilli, Michael Kirchhof, Adam Golinski, and Sinead Williamson. 2024b. Efficient and effective uncertainty quantification for LLMs. In Neurips Safe Generative AI Workshop 2024."},{"key":"e_1_3_2_1_121_1","volume-title":"Bayesian Low-rank Adaptation for Large Language Models. In The Twelfth International Conference on Learning Representations.","author":"Yang Adam X","unstructured":"Adam X Yang, Maxime Robeyns, Xi Wang, and Laurence Aitchison. [n.d.]. Bayesian Low-rank Adaptation for Large Language Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_122_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.325"},{"key":"e_1_3_2_1_123_1","doi-asserted-by":"crossref","unstructured":"Zhilin Yang Peng Qi Saizheng Zhang Yoshua Bengio William W Cohen Ruslan Salakhutdinov and Christopher D Manning. 2018. HotpotQA: A dataset for diverse explainable multi-hop question answering. arXiv preprint arXiv:1809.09600(2018).","DOI":"10.18653\/v1\/D18-1259"},{"key":"e_1_3_2_1_124_1","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611978520.43"},{"key":"e_1_3_2_1_125_1","volume-title":"Tree of thoughts: Deliberate problem solving with large language models. Advances in neural information processing systems","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. 2023. Tree of thoughts: Deliberate problem solving with large language models. Advances in neural information processing systems, Vol. 36 (2023), 11809-11822."},{"key":"e_1_3_2_1_126_1","doi-asserted-by":"crossref","unstructured":"Fanghua Ye Mingming Yang Jianhui Pang Longyue Wang Derek Wong Emine Yilmaz Shuming Shi and Zhaopeng Tu. 2025. Benchmarking llms via uncertainty quantification. Advances in Neural Information Processing Systems(2025) 15356-15385.","DOI":"10.52202\/079017-0491"},{"key":"e_1_3_2_1_127_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i15.29583"},{"key":"e_1_3_2_1_128_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.131"},{"key":"e_1_3_2_1_129_1","volume-title":"International Conference on Machine Learning(2025)","author":"Young Spencer","year":"2025","unstructured":"Spencer Young, Porter Jenkins, Lonchao Da, Jeff Dotson, and Hua Wei. 2025. Flexible heteroscedastic count regression with deep double poisson networks. International Conference on Machine Learning(2025)."},{"key":"e_1_3_2_1_130_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380126"},{"key":"e_1_3_2_1_131_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"key":"e_1_3_2_1_132_1","doi-asserted-by":"crossref","unstructured":"Boxuan Zhang and Ruqi Zhang. 2025. CoT-UQ: Improving Response-wise Uncertainty Quantification in LLMs with Chain-of-Thought. arXiv preprint arXiv:2502.17214(2025).","DOI":"10.18653\/v1\/2025.findings-acl.1339"},{"key":"e_1_3_2_1_133_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.299"},{"key":"e_1_3_2_1_134_1","unstructured":"Ruiyang Zhang Hu Zhang and Zhedong Zheng. 2024d. VL-Uncertainty: Detecting Hallucination in Large Vision-Language Model via Uncertainty Estimation. arXiv preprint arXiv:2411.11919(2024)."},{"key":"e_1_3_2_1_135_1","volume-title":"BERTScore: Evaluating Text Generation with BERT. In International Conference on Learning Representations.","author":"Zhang Tianyi","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q Weinberger, and Yoav Artzi. [n.d.]. BERTScore: Evaluating Text Generation with BERT. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_136_1","first-page":"118632","article-title":"Unveiling the tapestry of consistency in large vision-language models","volume":"37","author":"Zhang Yuan","year":"2024","unstructured":"Yuan Zhang, Tao Huang, Chun-Kai Fan, Hongyuan Dong, Jiawen Li, Jiacong Wang, Kuan Cheng, Shanghang Zhang, Haoyuan Guo, et al., 2024a. Unveiling the tapestry of consistency in large vision-language models. Advances in Neural Information Processing Systems, Vol. 37 (2024), 118632-118653.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_137_1","volume-title":"BadCM: Invisible backdoor attack against cross-modal learning","author":"Zhang Zheng","year":"2024","unstructured":"Zheng Zhang, Xu Yuan, Lei Zhu, Jingkuan Song, and Liqiang Nie. 2024c. BadCM: Invisible backdoor attack against cross-modal learning. IEEE Transactions on Image Processing(2024)."},{"key":"e_1_3_2_1_138_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.artmed.2023.102714"},{"key":"e_1_3_2_1_139_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Zhao Yao","year":"2023","unstructured":"Yao Zhao, Mikhail Khalman, Rishabh Joshi, Shashi Narayan, Mohammad Saleh, and Peter J Liu. 2023a. Calibrating Sequence likelihood Improves Conditional Language Generation. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_140_1","unstructured":"Chujie Zheng Hao Zhou Fandong Meng Jie Zhou and Minlie Huang. 2023. Large language models are not robust multiple choice selectors. arXiv preprint arXiv:2309.03882(2023)."},{"key":"e_1_3_2_1_141_1","unstructured":"Zhi Zheng Qian Feng Hang Li Alois Knoll and Jianxiang Feng. 2024. Evaluating uncertainty-based failure detection for closed-loop llm planners. arXiv preprint arXiv:2406.00430(2024)."}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3736569","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T17:59:47Z","timestamp":1777571987000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3736569"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":141,"alternative-id":["10.1145\/3711896.3736569","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3736569","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}