{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:28:40Z","timestamp":1776889720506,"version":"3.51.2"},"reference-count":121,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100002241","name":"JST CRONOS","doi-asserted-by":"publisher","award":["JPMJCS24K8"],"award-info":[{"award-number":["JPMJCS24K8"]}],"id":[{"id":"10.13039\/501100002241","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020959","name":"JST-Mirai Program","doi-asserted-by":"publisher","award":["JPMJMI20B8"],"award-info":[{"award-number":["JPMJMI20B8"]}],"id":[{"id":"10.13039\/501100020959","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001691","name":"JSPS KAKENHI","doi-asserted-by":"publisher","award":["JP21H04877"],"award-info":[{"award-number":["JP21H04877"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001691","name":"JSPS KAKENHI","doi-asserted-by":"publisher","award":["JP23H03372"],"award-info":[{"award-number":["JP23H03372"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001691","name":"JSPS KAKENHI","doi-asserted-by":"publisher","award":["JP24K02920"],"award-info":[{"award-number":["JP24K02920"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IIEEE Trans. Software Eng."],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1109\/tse.2024.3519464","type":"journal-article","created":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T19:28:09Z","timestamp":1735759689000},"page":"413-429","source":"Crossref","is-referenced-by-count":38,"title":["<i>Look Before You Leap:<\/i> An Exploratory Study of Uncertainty Analysis for Large Language Models"],"prefix":"10.1109","volume":"51","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3666-4020","authenticated-orcid":false,"given":"Yuheng","family":"Huang","sequence":"first","affiliation":[{"name":"The University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7093-9781","authenticated-orcid":false,"given":"Jiayang","family":"Song","sequence":"additional","affiliation":[{"name":"University of Alberta, Edmonton, AB, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4559-5426","authenticated-orcid":false,"given":"Zhijie","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Alberta, Edmonton, AB, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5035-2206","authenticated-orcid":false,"given":"Shengming","family":"Zhao","sequence":"additional","affiliation":[{"name":"University of Alberta, Edmonton, AB, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5678-472X","authenticated-orcid":false,"given":"Huaming","family":"Chen","sequence":"additional","affiliation":[{"name":"The University of Sydney, Sydney, NSW, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0857-8611","authenticated-orcid":false,"given":"Felix","family":"Juefei-Xu","sequence":"additional","affiliation":[{"name":"New York University, New York, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8621-2420","authenticated-orcid":false,"given":"Lei","family":"Ma","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Tokyo, Japan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i12.26752"},{"key":"ref2","article-title":"Code llama: Open foundation models for code","author":"Rozi\u00e8re","year":"2023"},{"key":"ref3","article-title":"ChatGPT","year":"2023"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-024-01291-2"},{"key":"ref5","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref6","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref7","article-title":"The llama ecosystem: Past, present, and future","year":"2023"},{"issue":"12","key":"ref8","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3571730","article-title":"Survey of hallucination in natural language generation","volume":"55","author":"Ji","year":"2023","journal-title":"ACM Comput. Surveys"},{"key":"ref9","article-title":"Understanding the capabilities, limitations, and societal impact of large language models","author":"Tamkin","year":"2021"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3461702.3462624"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.ijcnlp-main.45"},{"key":"ref12","doi-asserted-by":"crossref","DOI":"10.21203\/rs.3.rs-2566942\/v1","article-title":"Assessing the accuracy and reliability of AI-generated medical responses: An evaluation of the chat-GPT model","volume-title":"Res. Square","author":"Johnson","year":"2023"},{"key":"ref13","article-title":"Sparks of artificial general intelligence: Early experiments with GPT-4","author":"Bubeck","year":"2023"},{"key":"ref14","article-title":"Towards best practices in AGI safety and governance: A survey of expert opinion","author":"Schuett","year":"2023"},{"key":"ref15","article-title":"Frontier AI regulation: Managing emerging risks to public safety","author":"Anderljung","year":"2023"},{"key":"ref16","article-title":"Responsible AI standard, v2,\u201d Microsoft. 2022","year":"2023"},{"key":"ref17","article-title":"Our approach to ai safety","year":"2023"},{"key":"ref18","article-title":"Transform responsible ai from theory into practice","year":"2023"},{"key":"ref19","article-title":"Responsible ai practices","year":"2023"},{"key":"ref20","article-title":"Risk assessment at agi companies: A review of popular risk assessment techniques from other safety-critical industries","author":"Koessler","year":"2023"},{"key":"ref21","article-title":"Facebook\u2019s five pillars of responsible AI","year":"2021"},{"key":"ref22","article-title":"Responsible AI","year":"2023"},{"key":"ref23","article-title":"Building generative AI features responsibly","year":"2023"},{"key":"ref24","article-title":"CAPTUM: A unified and generic model interpretability library for pytorch","author":"Kokhlikyan","year":"2020"},{"key":"ref25","first-page":"344","article-title":"The hateful memes challenge: Competition report","volume-title":"Proc. NeurIPS Competition Demonstration Track","author":"Kiela","year":"2021"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3461702.3462571"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-021-05946-3"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.scitotenv.2019.06.320"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.557"},{"key":"ref30","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ouyang","year":"2022"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3695988"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-FoSE59343.2023.00008"},{"key":"ref33","article-title":"INCODER: A generative model for code infilling and synthesis","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Fried","year":"2023"},{"key":"ref34","article-title":"Santacoder: Don\u2019T reach for the stars!","author":"Allal"},{"key":"ref35","article-title":"CODEGEN: An open large language model for code with multi-turn program synthesis","author":"Nijkamp","year":"2022"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3611643.3617850"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00194"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00085"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00119"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.247"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3611643.3616243"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3611643.3616271"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3608134"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3377811.3380368"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2019.00108"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1002\/stvr.1840"},{"key":"ref47","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1994.389282"},{"key":"ref49","first-page":"858","article-title":"Large language models in machine translation","volume-title":"Proc. Joint Conf. Empirical Methods in Natural Lang. Process. Comput. Natural Lang. Learn. (EMNLP-CoNLL)","author":"Brants","year":"2007"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2010-343"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1031"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1387"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6479"},{"key":"ref55","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. NAACL-HLT","author":"Kenton","year":"2019"},{"key":"ref56","article-title":"ROBERTA: A robustly optimized bert pretraining approach","author":"Liu","year":"2019"},{"key":"ref57","article-title":"GraphcodeBERT: Pre-training code representations with data flow","author":"Guo","year":"2020"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.685"},{"issue":"8","key":"ref60","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3672459"},{"key":"ref62","article-title":"Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using chatGPT","author":"Xia","year":"2023"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1089\/big.2016.0051"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-010-5188-5"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1057\/s41599-020-0501-9"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02117-4"},{"key":"ref67","first-page":"1396","article-title":"Failing loudly: An empirical study of methods for detecting dataset shift","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Rabanser","year":"2019"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1016\/j.compchemeng.2019.03.034"},{"key":"ref69","article-title":"Pitfalls of in-domain uncertainty estimation and ensembling in deep learning","author":"Ashukha","year":"2020"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01096"},{"key":"ref71","first-page":"5580","article-title":"What uncertainties do we need in bayesian deep learning for computer vision?","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","volume":"30","author":"Kendall","year":"2017"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-023-10562-9"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-99978-4_9"},{"key":"ref74","first-page":"6405","article-title":"Simple and scalable predictive uncertainty estimation using deep ensembles","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Lakshminarayanan","year":"2017"},{"key":"ref75","first-page":"215","article-title":"Ensemble learning in bayesian neural networks","volume":"168","author":"Barber","year":"1998","journal-title":"Nato ASI Ser. F Comput. Syst. Sci."},{"key":"ref76","first-page":"1308","article-title":"Greedy policy search: A simple baseline for learnable test-time augmentation","volume-title":"Proc. Conf. Uncertainty Artif. Intell.","author":"Lyzhov","year":"2020"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.5555\/3045390.3045502"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2019.01.103"},{"key":"ref79","article-title":"A survey of safety and trustworthiness of large language models through the lens of verification and validation","author":"Huang","year":"2023"},{"key":"ref80","article-title":"How robust is GPT-3.5 to predecessors? A comprehensive study on language understanding tasks","author":"Chen","year":"2023"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.991"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.236"},{"key":"ref83","article-title":"Uncertainty estimation in autoregressive structured prediction","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Malinin"},{"key":"ref84","article-title":"Semantic uncertainty: Linguistic invariances for uncertainty estimation in natural language generation","author":"Kuhn","year":"2023"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-demo.30"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3019893"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4808"},{"key":"ref88","article-title":"GPT 3.5","year":"2023"},{"key":"ref89","article-title":"A baseline for detecting misclassified and out-of-distribution examples in neural networks","volume-title":"Proc. 5th Int. Conf. Learn. Representations (ICLR)","author":"Hendrycks","year":"2017"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.835"},{"key":"ref91","article-title":"Uncertainty in deep learning","author":"Gal","year":"2016"},{"key":"ref92","article-title":"An exploratory study of ai system risk assessment from the lens of data distribution and uncertainty","author":"Wang","year":"2022"},{"key":"ref93","first-page":"311","article-title":"BLEU: A method for automatic evaluation of machine translation","volume-title":"Proc. 40th Annu. Meeting Assoc. Comput. Linguistics","author":"Papineni","year":"2002"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00536"},{"key":"ref95","article-title":"The llama 3 herd of models","author":"Dubey","year":"2024"},{"key":"ref96","article-title":"GEMMA 2: Improving open language models at a practical size","author":"Team","year":"2024"},{"key":"ref97","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","author":"Abdin","year":"2024"},{"key":"ref98","article-title":"Qwen technical report","volume-title":"2023","author":"Bai"},{"key":"ref99","article-title":"DeepSeek-Coder: When the large language model meets programming\u2013the rise of code intelligence","author":"Guo","year":"2024"},{"key":"ref100","article-title":"Starcoder 2 and the stack v2: The next generation","author":"Lozhkov","year":"2024"},{"key":"ref101","article-title":"GPT-4o","year":"2024"},{"key":"ref102","article-title":"GPT-4o mini","year":"2024"},{"key":"ref103","first-page":"2013","article-title":"WikiQA: A challenge dataset for open-domain question answering","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Yang","year":"2015"},{"key":"ref104","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","volume-title":"Text Summarization Branches Out","author":"Lin","year":"2004"},{"key":"ref105","article-title":"CODEBLEU: A method for automatic evaluation of code synthesis","author":"Ren","year":"2020"},{"key":"ref106","first-page":"3558","article-title":"ELI5-Category: A categorized open-domain QA dataset","volume-title":"Proc. 57th Ann. Meeting Assoc. for Comput. Linguistics","author":"Gao","year":"2019"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K16-1028"},{"key":"ref108","doi-asserted-by":"crossref","first-page":"12","DOI":"10.3115\/v1\/W14-3302","article-title":"Findings of the 2014 workshop on statistical machine translation","volume-title":"Proc. 9th Workshop Statist. Mach. Transl.","author":"Bojar","year":"2014"},{"key":"ref109","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref110","article-title":"Program synthesis with large language models","author":"Austin","year":"2021"},{"key":"ref111","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/D19-1410","article-title":"Sentence-BERT: Sentence embeddings using siamese bert-networks","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Reimers","year":"2019"},{"key":"ref112","article-title":"Teaching large language models to self-debug","author":"Chen","year":"2023"},{"key":"ref113","article-title":"Training verifiers to solve math word problems","author":"Cobbe","year":"2021"},{"key":"ref114","article-title":"all-mpnet-base-v2","year":"2023"},{"key":"ref115","article-title":"A theory for emergence of complex skills in language models","author":"Arora","year":"2023"},{"key":"ref116","doi-asserted-by":"crossref","first-page":"4902","DOI":"10.18653\/v1\/2020.acl-main.442","article-title":"Beyond accuracy: Behavioral testing of NLP models with CheckList","volume-title":"Proc. 58th Annu. Meeting Assoc. Comput. Linguistics","author":"Ribeiro","year":"2020"},{"key":"ref117","doi-asserted-by":"crossref","first-page":"1014","DOI":"10.18653\/v1\/2023.wmt-1.97","article-title":"Automating behavioral testing in machine translation","volume-title":"Proc. Eighth Conf. Mach. Transl.","author":"Ferrando","year":"2023"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1145\/3672455"},{"key":"ref119","article-title":"Evaluating quantized large language models","author":"Li","year":"2024"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00129"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3639226"}],"container-title":["IEEE Transactions on Software Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/32\/10885779\/10820047.pdf?arnumber=10820047","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,20]],"date-time":"2025-02-20T20:18:01Z","timestamp":1740082681000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10820047\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2]]},"references-count":121,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tse.2024.3519464","relation":{},"ISSN":["0098-5589","1939-3520","2326-3881"],"issn-type":[{"value":"0098-5589","type":"print"},{"value":"1939-3520","type":"electronic"},{"value":"2326-3881","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2]]}}}