{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T09:33:53Z","timestamp":1775640833901,"version":"3.50.1"},"reference-count":255,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"7","license":[{"start":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T00:00:00Z","timestamp":1719792000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T00:00:00Z","timestamp":1719792000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T00:00:00Z","timestamp":1719792000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100007631","name":"Canada CIFAR AI Chairs Program","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100007631","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000038","name":"Natural Sciences and Engineering Research Council of Canada","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020959","name":"JST-Mirai Program","doi-asserted-by":"publisher","award":["JPMJMI20B8"],"award-info":[{"award-number":["JPMJMI20B8"]}],"id":[{"id":"10.13039\/501100020959","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001691","name":"JSPS KAKENHI","doi-asserted-by":"publisher","award":["JP21H04877"],"award-info":[{"award-number":["JP21H04877"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001691","name":"JSPS 
KAKENHI","doi-asserted-by":"publisher","award":["JP23H03372"],"award-info":[{"award-number":["JP23H03372"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001691","name":"JSPS KAKENHI","doi-asserted-by":"publisher","award":["JP24K02920"],"award-info":[{"award-number":["JP24K02920"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100013373","name":"Autoware Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100013373","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Software Eng."],"published-print":{"date-parts":[[2024,7]]},"DOI":"10.1109\/tse.2024.3411928","type":"journal-article","created":{"date-parts":[[2024,6,18]],"date-time":"2024-06-18T17:41:07Z","timestamp":1718732467000},"page":"1921-1948","source":"Crossref","is-referenced-by-count":14,"title":["LUNA: A Model-Based Universal Analysis Framework for Large Language Models"],"prefix":"10.1109","volume":"50","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9267-4229","authenticated-orcid":false,"given":"Da","family":"Song","sequence":"first","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Alberta, Edmonton, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3981-8515","authenticated-orcid":false,"given":"Xuan","family":"Xie","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Alberta, Edmonton, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7093-9781","authenticated-orcid":false,"given":"Jiayang","family":"Song","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Alberta, Edmonton, 
Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9552-0097","authenticated-orcid":false,"given":"Derui","family":"Zhu","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Technical University of Munich, Munich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3666-4020","authenticated-orcid":false,"given":"Yuheng","family":"Huang","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Alberta, Edmonton, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0857-8611","authenticated-orcid":false,"given":"Felix","family":"Juefei-Xu","sequence":"additional","affiliation":[{"name":"New York University, New York, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8621-2420","authenticated-orcid":false,"given":"Lei","family":"Ma","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Tokyo, Japan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3491101.3519665"},{"key":"ref2","article-title":"Conversational automated program repair","author":"Xia","year":"2023"},{"key":"ref3","article-title":"Sentiment analysis in the era of large language models: A reality check","author":"Zhang","year":"2023"},{"key":"ref4","article-title":"Large language models are human-level prompt engineers","author":"Zhou","year":"2022"},{"key":"ref5","article-title":"ChatGPT","year":"2022"},{"key":"ref6","article-title":"GPT4","year":"2023"},{"key":"ref7","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref8","article-title":"Sparks of artificial general intelligence: Early experiments with GPT-4","author":"Bubeck","year":"2023"},{"key":"ref9","article-title":"DecodingTrust: A comprehensive assessment of trustworthiness in GPT models","author":"Wang","year":"2023"},{"key":"ref10","article-title":"Measuring reliability of large language models through semantic 
consistency","author":"Raj","year":"2022"},{"key":"ref11","article-title":"InfoBERT: Improving robustness of language models from an information theoretic perspective","author":"Wang","year":"2020"},{"key":"ref12","article-title":"A review on language models as knowledge bases","author":"AlKhamissi","year":"2022"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3461702.3462624"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.173"},{"key":"ref16","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022"},{"key":"ref17","article-title":"ChatCAD: Interactive computer-aided diagnosis on medical image using large language models","author":"Wang","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3132747.3132785"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3238147.3238202"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2019.00108"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3293882.3330579"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ISSRE.2018.00021"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3180155.3180220"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3238147.3238187"},{"key":"ref25","first-page":"6618","article-title":"Repairing without retraining: Avoiding disparate impact with counterfactual distributions","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2019"},{"key":"ref26","first-page":"1","article-title":"Correcting deep neural networks with small, generalizing patches","volume-title":"Proc. Workshop Saf. Robustness Decis. 
Making","author":"Sotoudeh","year":"2019"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2019.00043"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TR.2021.3096332"},{"key":"ref29","first-page":"11383","article-title":"RNNRepair: Automatic RNN repair via model-based analysis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Xie","year":"2021"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3511598"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510232"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/SANER53432.2022.00056"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00104"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3533767.3534386"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3533767.3534394"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3604609"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3597926.3598045"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3551349.3556920"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3533767.3534408"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3597926.3598109"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3544792"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-Companion58688.2023.00027"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3551349.3556968"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1016\/j.infsof.2023.107272"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3563210"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICST57152.2023.00030"},{"key":"ref47","doi-asserted-by":"crossref","first-page":"359","DOI":"10.1145\/3377811.3380353","article-title":"Misbehaviour prediction for autonomous driving systems","volume-title":"Proc. ACM\/IEEE 42nd Int. Conf. Softw. 
Eng.","author":"Stocco","year":"2020"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3460319.3464825"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ISSRE5003.2020.00035"},{"key":"ref50","first-page":"396","article-title":"Handwritten digit recognition with a back-propagation network","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"2","author":"LeCun","year":"1989"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21236\/ada164453"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3460319.3464811"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ISSRE55969.2022.00016"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3338906.3338954"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00092"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-88885-5_24"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2023.3282981"},{"key":"ref58","article-title":"Mosaic: Model-based safety analysis framework for AI-enabled cyber-physical systems","author":"Xie","year":"2023"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510051"},{"key":"ref60","first-page":"499","article-title":"Towards interpreting recurrent neural networks through probabilistic abstraction","volume-title":"Proc. 35th IEEE\/ACM Int. Conf. Automated Softw. 
Eng.","author":"Dong","year":"2020"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3585005"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref63","doi-asserted-by":"crossref","DOI":"10.1145\/3695988","article-title":"Large language models for software engineering: A systematic literature review","author":"Hou","year":"2024"},{"key":"ref64","article-title":"A new era in software security: Towards self-healing software via large language models and formal verification","author":"Charalambous","year":"2023"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-FoSE59343.2023.00010"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-FoSE59343.2023.00008"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1093\/bib\/bbac409"},{"key":"ref68","article-title":"Galactica: A large language model for science","author":"Taylor","year":"2022"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1148\/radiol.230163"},{"key":"ref70","article-title":"Large language models are state-of-the-art evaluators of translation quality","author":"Kocmi","year":"2023"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1016\/j.lindif.2023.102274"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/219717.219745"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1016\/j.metrad.2023.100017"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3204972"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00632"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/3520312.3534862"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1145\/3551349.3559555"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/72.279181"},{"key":"ref79","first-page":"1310","article-title":"On the difficulty of training recurrent neural networks","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Pascanu","year":"2013"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3512045"},{"key":"ref81","article-title":"The internal state of an LLM knows when it's lying","author":"Azaria","year":"2023"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00045"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-022-01756-8"},{"key":"ref84","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics: Human Lang. Technol.","volume":"1","author":"Devlin","year":"2019"},{"key":"ref85","article-title":"DeBERTa: Decoding-enhanced BERT with disentangled attention","author":"He","year":"2020"},{"key":"ref86","article-title":"RoBERTa: A robustly optimized BERT pretraining approach","author":"Liu","year":"2019"},{"key":"ref87","doi-asserted-by":"crossref","first-page":"7871","DOI":"10.18653\/v1\/2020.acl-main.703","article-title":"BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","volume-title":"Proc. 58th Annu. Meeting Assoc. Comput. Linguistics","author":"Lewis","year":"2020"},{"key":"ref88","article-title":"UL2: Unifying language learning paradigms","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Tay","year":"2023"},{"issue":"1","key":"ref89","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref90","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref91","article-title":"Code Llama: Open foundation models for code","author":"Roziere","year":"2023"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-3020"},{"key":"ref93","article-title":"Latent Jailbreak: A benchmark for evaluating text safety and output robustness of large language models","author":"Qiu","year":"2023"},{"key":"ref94","article-title":"Rain: Your language models can align themselves without finetuning","author":"Li","year":"2023"},{"key":"ref95","article-title":"LLM self defense: By self examination, llms know they are being tricked","author":"Helbling","year":"2023"},{"key":"ref96","article-title":"Concrete problems in AI safety","author":"Amodei","year":"2016"},{"key":"ref97","first-page":"1","article-title":"A baseline for detecting misclassified and out-of-distribution examples in neural networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hendrycks","year":"2017"},{"key":"ref98","article-title":"A simple unified framework for detecting out-of-distribution samples and adversarial attacks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Lee","year":"2018"},{"key":"ref99","article-title":"Enhancing the reliability of out-of-distribution image detection in neural networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Liang","year":"2018"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i7.20752"},{"key":"ref101","article-title":"A survey on out-of-distribution detection in NLP","author":"Lang","year":"2023"},{"key":"ref102","first-page":"10687","article-title":"Types of out-of-distribution texts and how to detect them","volume-title":"Proc. Conf. Empirical Methods Natural Lang. 
Process.","author":"Arora","year":"2021"},{"key":"ref103","first-page":"1","article-title":"Out-of-distribution detection and selective generation for conditional language models","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Ren","year":"2023"},{"key":"ref104","doi-asserted-by":"crossref","first-page":"5684","DOI":"10.18653\/v1\/2020.acl-main.503","article-title":"Selective question answering under domain shift","volume-title":"Proc. 58th Annu. Meeting Assoc. Comput. Linguistics","author":"Kamath","year":"2020"},{"key":"ref105","article-title":"On the robustness of ChatGPT: An adversarial and out-of-distribution perspective","author":"Wang","year":"2023"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2011.06.019"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-40994-3_25"},{"key":"ref108","first-page":"1","article-title":"Intriguing properties of neural networks","volume-title":"Proc. 2nd Int. Conf. Learn. Representations (ICLR)","author":"Szegedy","year":"2014"},{"key":"ref109","article-title":"Explaining and harnessing adversarial examples","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Goodfellow","year":"2015"},{"key":"ref110","article-title":"Adversarial attacks and defences: A survey","author":"Chakraborty","year":"2018"},{"key":"ref111","first-page":"1","article-title":"Adversarial glue: A multi-task benchmark for robustness evaluation of language models","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Wang","year":"2021"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1145\/3593042"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.92"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1437"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-3204"},{"key":"ref116","article-title":"Evaluating the factual consistency of abstractive text summarization","author":"Kry\u015bci\u0144ski","year":"2019"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1255"},{"key":"ref118","doi-asserted-by":"crossref","first-page":"831","DOI":"10.18653\/v1\/P19-1080","article-title":"Constrained decoding for neural NLG from compositional representations in task-oriented dialogue","volume-title":"Proc. 57th Annu. Meeting Assoc. Comput. Linguistics","author":"Balakrishnan","year":"2019"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1016\/0893-6080(95)00086-0"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1993.5.6.976"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/SCCC.2003.1245447"},{"key":"ref122","first-page":"423","article-title":"Marble: Model-based robustness analysis of stateful deep learning systems","volume-title":"Proc. 35th IEEE\/ACM Int. Conf. Automated Softw. 
Eng.","author":"Du","year":"2020"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i13.17391"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/VAST.2017.8585721"},{"key":"ref125","article-title":"Countable-state Markov chains","year":"2011"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2021.3103064"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ASE51524.2021.9678871"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510080"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-90870-6_5"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-17244-1_22"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1039\/C3AY41907J"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-73003-5_196"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/3477.764879"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/14.9.755"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/MASSP.1986.1165342"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007469218079"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177697196"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.229"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/ATS47505.2019.000-8"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1007\/11817949_3"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1016\/j.infsof.2022.107117"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.3102\/1076998618822540"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1017\/cbo9780511526237"},{"key":"ref144","first-page":"2633","article-title":"Extracting training data from large language models","volume-title":"Proc. 30th USENIX Secur. Symp. (USENIX Secur. 
21)","author":"Carlini","year":"2021"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2018.2811489"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1037\/0033-295X.102.4.684"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.557"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1145\/3641289"},{"key":"ref149","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ouyang","year":"2022"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445922"},{"key":"ref151","article-title":"Alignment of language agents","author":"Kenton","year":"2021"},{"key":"ref152","article-title":"Rome was built in 1776: A case study on factual correctness in knowledge-grounded response generation","author":"Santhanam","year":"2021"},{"key":"ref153","article-title":"The factual inconsistency problem in abstractive text summarization: A survey","author":"Huang","year":"2021"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1016\/S0031-3203(96)00142-2"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.754"},{"key":"ref156","article-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality","author":"Chiang","year":"2023"},{"key":"ref157","article-title":"Stanford alpaca: An instruction-following LLaMA model","author":"Taori","year":"2023"},{"key":"ref158","article-title":"LLaMA 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref159","article-title":"Is your code generated by ChatGPT really correct? 
Rigorous evaluation of large language models for code generation","author":"Liu","year":"2023"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-024-10824-0"},{"key":"ref161","doi-asserted-by":"crossref","first-page":"6723","DOI":"10.18653\/v1\/2022.acl-long.464","article-title":"A token-level reference-free hallucination detection benchmark for free-form text generation","volume-title":"Proc. 60th Annu. Meeting Assoc. Comput. Linguistics (Volume 1: Long Papers)","author":"Liu","year":"2022"},{"key":"ref162","first-page":"1631","article-title":"Recursive deep models for semantic compositionality over a sentiment treebank","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Socher","year":"2013"},{"key":"ref163","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref164","article-title":"Program synthesis with large language models","author":"Austin","year":"2021"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/314"},{"key":"ref166","article-title":"CodeSearchNet challenge: Evaluating the state of semantic code search","author":"Husain","year":"2019"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510060"},{"key":"ref168","article-title":"GPT 3.5","year":"2022"},{"key":"ref169","article-title":"Inference-time intervention: Eliciting truthful answers from a language model","author":"Li","year":"2023"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1214\/aos\/1176348768"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177730491.MR0022058.Zbl0041.26103"},{"key":"ref172","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/978-3-642-00296-0","article-title":"Pearson correlation coefficient","volume-title":"Noise Reduction in Speech 
Processing","author":"Cohen","year":"2009"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.2307\/2332226"},{"key":"ref174","doi-asserted-by":"crossref","DOI":"10.4135\/9781412983808","volume-title":"Correlation: Parametric and Nonparametric Measures","author":"Chen","year":"2002"},{"key":"ref175","article-title":"WikiChat: A few-shot LLM-based chatbot grounded with Wikipedia","author":"Semnani","year":"2023"},{"issue":"285","key":"ref176","first-page":"1","article-title":"DeepChecks: A library for testing and validating machine learning models and data","volume":"23","author":"Chorev","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.23919\/DATE.2019.8714971"},{"key":"ref178","article-title":"Outside the box: Abstraction-based monitoring of neural networks","author":"Henzinger","year":"2019"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3055015"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE43902.2021.00034"},{"key":"ref181","article-title":"Large language models in fault localisation","author":"Wu","year":"2023"},{"key":"ref182","article-title":"Evaluating the robustness of neural networks: An extreme value theory approach","author":"Weng","year":"2018"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1145\/3238147.3238172"},{"key":"ref184","article-title":"Bias assessment and mitigation in llm-based code generation","author":"Huang","year":"2023"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-Companion58688.2023.00033"},{"key":"ref186","first-page":"809","article-title":"FEVER: A large-scale dataset for fact extraction and VERification","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics: Human Lang. 
Technol.","volume":"1","author":"Thorne","year":"2018"},{"key":"ref187","first-page":"7856","article-title":"q2: Evaluating factual consistency in knowledge-grounded dialogues via question generation and question answering","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Honovich","year":"2021"},{"key":"ref188","first-page":"347","article-title":"TextFlint: Unified multilingual robustness evaluation toolkit for natural language processing","volume-title":"Proc. 59th Annu. Meeting Assoc. Comput. Linguistics\/11th Int. Joint Conf. Natural Lang. Process.: System Demonstrations","author":"Wang","year":"2021"},{"key":"ref189","doi-asserted-by":"crossref","first-page":"3356","DOI":"10.18653\/v1\/2020.findings-emnlp.301","article-title":"RealToxicityPrompts: Evaluating neural toxic degeneration in language models","volume-title":"Proc. Findings Assoc. Comput. Linguistics: EMNLP 2020","author":"Gehman","year":"2020"},{"key":"ref190","article-title":"Can ChatGPT-like generative models guarantee factual accuracy? On the mistakes of new generation search engines","author":"Zhao","year":"2023"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/SMC53992.2023.10394237"},{"key":"ref192","article-title":"Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation","author":"Liu","year":"2023"},{"key":"ref193","article-title":"Mathematical capabilities of ChatGPT","author":"Frieder","year":"2023"},{"key":"ref194","article-title":"Evaluating the logical reasoning ability of ChatGPT and GPT-4","author":"Liu","year":"2023"},{"key":"ref195","article-title":"Holistic evaluation of language models","author":"Liang","year":"2022"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.76"},{"key":"ref197","first-page":"1393","article-title":"Detecting hallucinated content in conditional neural sequence generation","volume-title":"Proc. Findings Assoc. Comput. 
Linguistics (ACL-IJCNLP)","author":"Zhou","year":"2021"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i12.26752"},{"key":"ref199","article-title":"Interpretable unified language checking","author":"Zhang","year":"2023"},{"key":"ref200","article-title":"Uncertainty estimation in autoregressive structured prediction","author":"Malinin","year":"2020"},{"key":"ref201","first-page":"2734","article-title":"On hallucination and predictive uncertainty in conditional language generation","volume-title":"Proc. 16th Conf. Eur. Chapter Assoc. Comput. Linguistics: Main Volume","author":"Xiao","year":"2021"},{"key":"ref202","article-title":"Look before you leap: An exploratory study of uncertainty measurement for large language models","author":"Huang","year":"2023"},{"key":"ref203","first-page":"1","article-title":"Semantic uncertainty: Linguistic invariances for uncertainty estimation in natural language generation","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Kuhn","year":"2023"},{"key":"ref204","article-title":"Generating with confidence: Uncertainty quantification for black-box large language models","author":"Lin","year":"2023"},{"key":"ref205","article-title":"Uncertainty in natural language generation: From theory to applications","author":"Baan","year":"2023"},{"key":"ref206","first-page":"1587","article-title":"Toward controlled generation of text","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Hu","year":"2017"},{"key":"ref207","first-page":"34586","article-title":"Factuality enhanced language models for open-ended text generation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Lee","year":"2022"},{"key":"ref208","doi-asserted-by":"crossref","first-page":"401","DOI":"10.18653\/v1\/2022.acl-long.31","article-title":"Mix and match: Learning-free controllable text generation using energy language models","volume-title":"Proc. 60th Annu. Meeting Assoc. Comput. 
Linguistics (Volume 1: Long Papers)","author":"Mireshghallah","year":"2022"},{"key":"ref209","article-title":"Systematic rectification of language models via dead-end analysis","author":"Cao","year":"2023"},{"key":"ref210","first-page":"339","article-title":"Learning to repair: Repairing model output errors after deployment using a dynamic memory of feedback","volume-title":"Proc. Findings Assoc. Comput. Linguistics (NAACL)","author":"Tandon","year":"2022"},{"key":"ref211","article-title":"Critic: Large language models can self-correct with tool-interactive critiquing","author":"Gou","year":"2023"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.67"},{"key":"ref213","article-title":"Self-refine: Iterative refinement with self-feedback","author":"Madaan","year":"2023"},{"key":"ref214","article-title":"Teaching large language models to self-debug","author":"Chen","year":"2023"},{"key":"ref215","article-title":"Rethinking with retrieval: Faithful large language model inference","author":"He","year":"2022"},{"key":"ref216","article-title":"Check your facts and try again: Improving large language models with external knowledge and automated feedback","author":"Peng","year":"2023"},{"key":"ref217","article-title":"WebBrain: Learning to generate factually correct articles for queries by grounding on large web corpus","author":"Qian","year":"2023"},{"key":"ref218","article-title":"Replug: Retrieval-augmented black-box language models","author":"Shi","year":"2023"},{"key":"ref219","article-title":"Mitigating language model hallucination with interactive question-knowledge alignment","author":"Zhang","year":"2023"},{"key":"ref220","article-title":"Brain in a vat: On missing pieces towards artificial general intelligence in large language 
models","author":"Ma","year":"2023"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1145\/130385.130432"},{"key":"ref222","doi-asserted-by":"crossref","first-page":"740","DOI":"10.18653\/v1\/P18-2117","article-title":"On the practical computational power of finite precision RNNs for language recognition","volume-title":"Proc. 56th Annu. Meeting Assoc. Comput. Linguistics (Volume 2: Short Papers)","author":"Weiss","year":"2018"},{"key":"ref223","doi-asserted-by":"crossref","first-page":"1","DOI":"10.18653\/v1\/W19-3901","article-title":"Sequential neural networks as automata","volume-title":"Proc. Workshop Deep Learn. Formal Lang. Building Bridges","author":"Merrill","year":"2019"},{"key":"ref224","first-page":"5247","article-title":"Extracting automata from recurrent neural networks using queries and counterexamples","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Weiss","year":"2018"},{"key":"ref225","first-page":"1","article-title":"Learning deterministic weighted automata with queries and counterexamples","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Weiss","year":"2019"},{"key":"ref226","first-page":"81","article-title":"Explaining black boxes on sequential data using weighted automata","volume-title":"Proc. Int. Conf. 
Grammatical Inference","author":"Ayache","year":"2019"},{"key":"ref227","article-title":"Weighted automata extraction and explanation of recurrent neural networks for natural language tasks","author":"Wei","year":"2023"},{"key":"ref228","article-title":"Extracting finite automata from RNNs using state merging","author":"Merrill","year":"2022"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539356"},{"key":"ref230","first-page":"1","article-title":"Efficient adversarial sequence generation for RNN with symbolic weighted finite automata","author":"Ma","journal-title":"SafeAI@ AAAI"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580852"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i8.20801"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.751"},{"key":"ref234","article-title":"Interpretability in the wild: A circuit for indirect object identification in GPT-2 small","author":"Wang","year":"2022"},{"key":"ref235","first-page":"468","article-title":"Neuro-symbolic language modeling with automaton-augmented retrieval","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Alon","year":"2022"},{"key":"ref236","first-page":"1","article-title":"Validating large language models with RELM","volume-title":"Proc. Mach. Learn. 
Syst.","volume":"5","author":"Kuchnik","year":"2023"},{"key":"ref237","article-title":"CodeGen: An open large language model for code with multi-turn program synthesis","author":"Nijkamp","year":"2022"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1145\/3368089.3417058"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1126\/science.abq1158"},{"key":"ref240","article-title":"Self-collaboration code generation via ChatGPT","author":"Dong","year":"2023"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1145\/3551349.3559555"},{"key":"ref242","first-page":"6563","article-title":"Code generation as a dual task of code summarization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Wei","year":"2019"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.685"},{"key":"ref244","article-title":"Incoder: A generative model for code infilling and synthesis","author":"Fried","year":"2022"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00119"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00085"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2024.24556"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1145\/3597926.3598067"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1109\/APR59189.2023.00012"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1145\/3524459.3527351"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00125"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00129"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00128"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1145\/3643674"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.1109\/tse.2024.3368208\/mm1"}],"container-title":["IEEE Transactions on Software 
Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/32\/10601498\/10562221.pdf?arnumber=10562221","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T03:46:18Z","timestamp":1732247178000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10562221\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7]]},"references-count":255,"journal-issue":{"issue":"7"},"URL":"https:\/\/doi.org\/10.1109\/tse.2024.3411928","relation":{},"ISSN":["0098-5589","1939-3520","2326-3881"],"issn-type":[{"value":"0098-5589","type":"print"},{"value":"1939-3520","type":"electronic"},{"value":"2326-3881","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,7]]}}}