{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T09:13:08Z","timestamp":1774429988028,"version":"3.50.1"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T00:00:00Z","timestamp":1774396800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T00:00:00Z","timestamp":1774396800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Autom Softw Eng"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1007\/s10515-026-00605-0","type":"journal-article","created":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T06:46:50Z","timestamp":1774421210000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Hallucination detection in LLM code generation: A sampling-based consensus verification approach"],"prefix":"10.1007","volume":"33","author":[{"given":"Taicheng","family":"Huang","sequence":"first","affiliation":[]},{"given":"Zhanhui","family":"Ren","sequence":"additional","affiliation":[]},{"given":"Yuan","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Xiangping","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Zibin","family":"Zheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,3,25]]},"reference":[{"key":"605_CR1","unstructured":"Agarwal, V., Pei, Y., Alamir, S.: Codemirage: Hallucinations in code generated by large language models. arXiv preprint arXiv:2408.08333 (2024)"},{"key":"605_CR2","unstructured":"Austin, J., Odena, A., Nye, M.: Program synthesis with large language models. arXiv preprint arXiv:2108.07732 (2021)"},{"key":"605_CR3","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1016\/j.ins.2019.01.023","volume":"483","author":"S Bag","year":"2019","unstructured":"Bag, S., Kumar, S.: An efficient recommendation generation using relevant jaccard similarity. Inf. Sci. 483, 53\u201364 (2019)","journal-title":"Inf. Sci."},{"key":"605_CR4","unstructured":"Banerjee, S., Lavie, A.: Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In: Proceedings of the Acl Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"605_CR5","unstructured":"Cao, S., Shi, J., Pan, L.: Kqa pro: A dataset with explicit compositional programs for complex question answering over knowledge base. arXiv preprint arXiv:2007.03875 (2020)"},{"key":"605_CR6","unstructured":"Chen, M.: Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)"},{"key":"605_CR7","doi-asserted-by":"crossref","unstructured":"Chen X, Hu X, Huang Y, et al.: Deep learning-based software engineering: progress, challenges, and opportunities. Science China Information Sciences68(1):111102 (2025)","DOI":"10.1007\/s11432-023-4127-5"},{"key":"605_CR8","doi-asserted-by":"crossref","unstructured":"Chen X, Xu F, Huang Y, et al.: Jit-smart: A multi-task learning framework for just-in-time defect prediction and localization. Proceedings of the ACM on Software Engineering 1(FSE):1\u201323 (2024)","DOI":"10.1145\/3643727"},{"key":"605_CR9","unstructured":"Cobbe, K., Kosaraju, V., Bavarian, M.: Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 (2021)"},{"key":"605_CR10","doi-asserted-by":"crossref","unstructured":"Fan, Z., Gao, X., Mirchev, M.: Automated repair of programs from large language models. In: 2023 IEEE\/ACM 45th International Conference on Software Engineering (ICSE), IEEE, pp. 1469\u20131481 (2023)","DOI":"10.1109\/ICSE48619.2023.00128"},{"issue":"8017","key":"605_CR11","doi-asserted-by":"publisher","first-page":"625","DOI":"10.1038\/s41586-024-07421-0","volume":"630","author":"S Farquhar","year":"2024","unstructured":"Farquhar, S., Kossen, J., Kuhn, L.: Detecting hallucinations in large language models using semantic entropy. Nature 630(8017), 625\u2013630 (2024)","journal-title":"Nature"},{"key":"605_CR12","unstructured":"Grattafiori, A., Dubey, A., Jauhri, A.: The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)"},{"key":"605_CR13","unstructured":"Huang Y, Chen Y, Chen X, et al.: Generative software engineering. arXiv preprint (2024) https:\/\/arxiv.org\/abs\/2403.02583"},{"key":"605_CR14","doi-asserted-by":"crossref","unstructured":"Huang Y, Hu X, Jia N, et al.: Learning code context information to predict comment locations. IEEE Transactions on Reliability 69(1):88\u2013105 (2019)","DOI":"10.1109\/TR.2019.2931725"},{"key":"605_CR15","doi-asserted-by":"crossref","unstructured":"Huang Y, Jia N, Chen X, et al.: Salient-class location: Help developers understand code change in code review. In: Proceedings of the 2018 26th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering, pp 770\u2013774 (2018)","DOI":"10.1145\/3236024.3264841"},{"key":"605_CR16","unstructured":"Huang Y, Zhou Y, Chen X: Comment traps: How defective commentedout code augment defects in ai-assisted code generation. arXiv preprint (2025) https:\/\/arxiv.org\/abs\/2512.20334"},{"key":"605_CR17","doi-asserted-by":"publisher","first-page":"453","DOI":"10.1162\/tacl_a_00276","volume":"7","author":"T Kwiatkowski","year":"2019","unstructured":"Kwiatkowski, T., Palomaki, J., Redfield, O.: Natural questions: a benchmark for question answering research. Trans. Assoc. Computat. Linguist. 7, 453\u2013466 (2019)","journal-title":"Trans. Assoc. Computat. Linguist."},{"key":"605_CR18","doi-asserted-by":"publisher","unstructured":"Latif, A., Azam, F., Anwar, M.: Comparison of leading language parsers \u2013 antlr, javacc, sablecc, tree-sitter, yacc, bison. In: 2023 13th International Conference on Software Technology and Engineering (ICSTE), pp. 7\u201313 (2023). https:\/\/doi.org\/10.1109\/ICSTE61649.2023.00009","DOI":"10.1109\/ICSTE61649.2023.00009"},{"key":"605_CR19","unstructured":"Lauscher, A., Glava, G.: How much do llms hallucinate across languages? on multilingual estimation of llm hallucination in the wild. arXiv preprint arXiv:2502.12769 (2025)"},{"key":"605_CR20","first-page":"57619","volume":"37","author":"J Li","year":"2024","unstructured":"Li, J., Li, G., Zhang, X.: Evocodebench: An evolving code generation benchmark with domain-specific evaluations. Adv. Neural. Inf. Process. Syst. 37, 57619\u201357641 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"605_CR21","unstructured":"Liu, F., Liu, Y., Shi, L.: Exploring and evaluating hallucinations in llm-powered code generation. arXiv preprint arXiv:2404.00971 (2024a)"},{"key":"605_CR22","unstructured":"Liu, H., Xue, W., Chen, Y.: A survey on hallucination in large vision-language models. arXiv preprint arXiv:2402.00253 (2024b)"},{"key":"605_CR23","doi-asserted-by":"publisher","first-page":"21558","DOI":"10.52202\/075280-0943","volume":"36","author":"J Liu","year":"2023","unstructured":"Liu, J., Xia, C.S., Wang, Y.: Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation. Adv. Neural. Inf. Process. Syst. 36, 21558\u201321572 (2023a)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"605_CR24","doi-asserted-by":"publisher","first-page":"21558","DOI":"10.52202\/075280-0943","volume":"36","author":"J Liu","year":"2023","unstructured":"Liu, J., Xia, C.S., Wang, Y.: Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation. Adv. Neural. Inf. Process. Syst. 36, 21558\u201321572 (2023b)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"605_CR25","doi-asserted-by":"crossref","unstructured":"Maharaj, K., Munigala, V., Tamilselvam, S.: Etf: An entity tracing framework for hallucination detection in code summaries. In: Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 30639\u201330652 (2025)","DOI":"10.18653\/v1\/2025.acl-long.1480"},{"key":"605_CR26","doi-asserted-by":"crossref","unstructured":"Manakul, P., Liusie, A., Gales, M.: Selfcheckgpt: Zero-resource black-box hallucination detection for generative large language models. In: Proceedings of the 2023 Conference On Empirical Methods in Natural Language Processing, pp. 9004\u20139017 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.557"},{"key":"605_CR27","first-page":"44102","volume":"37","author":"M McDermott","year":"2024","unstructured":"McDermott, M., Zhang, H., Hansen, L.: A closer look at auroc and auprc under class imbalance. Adv. Neural. Inf. Process. Syst. 37, 44102\u201344163 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"605_CR28","doi-asserted-by":"crossref","unstructured":"Nam, D., Macvean, A., Hellendoorn, V.: Using an llm to help with code understanding. In: Proceedings of the IEEE\/ACM 46th International Conference on Software Engineering, pp. 1\u201313 (2024)","DOI":"10.1145\/3597503.3639187"},{"key":"605_CR29","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"605_CR30","doi-asserted-by":"crossref","unstructured":"Rapp, S., Maedche, A., Jainta, M.: Exploring the effects of consistency-based hallucination detection for llm-based qa chatbots: A simulation study. In: International Symposium on Chatbots and Human-Centered AI, Springer, pp. 33\u201347 (2024)","DOI":"10.1007\/978-3-031-88045-2_3"},{"key":"605_CR31","unstructured":"Ravi, S.S., Mielczarek, B., Kannappan, A.: Lynx: An open source hallucination evaluation model. arXiv preprint arXiv:2407.08488 (2024)"},{"key":"605_CR32","unstructured":"Ren, S., Guo, D., Lu, S.: Codebleu: a method for automatic evaluation of code synthesis. arXiv preprint arXiv:2009.10297 (2020)"},{"key":"605_CR33","doi-asserted-by":"crossref","unstructured":"Sarkar, S.: Auto-generated ai code hallucinations: Detection, impact, and mitigation strategies. Impact, and Mitigation Strategies (October 15, 2025) (2025)","DOI":"10.2139\/ssrn.5610993"},{"key":"605_CR34","doi-asserted-by":"crossref","unstructured":"Sedgwick, P.: Spearman\u2019s rank correlation coefficient. Bmj 349 (2014)","DOI":"10.1136\/bmj.g7327"},{"key":"605_CR35","unstructured":"Spracklen, J., Wijewickrama, R., Sakib, A.: We have a package for you! a comprehensive analysis of package hallucinations by code generating llms. arXiv preprint arXiv:2406.10279 (2024)"},{"key":"605_CR36","doi-asserted-by":"crossref","unstructured":"Sun, Y., Yin, Z., Guo, Q.: Benchmarking hallucination in large language models based on unanswerable math word problem. arXiv preprint arXiv:2403.03558 (2024)","DOI":"10.63317\/3jovt56oiu3g"},{"key":"605_CR37","doi-asserted-by":"crossref","unstructured":"Tian, Y., Yan, W., Yang, Q.: Codehalu: Investigating code hallucinations in llms via execution-based verification. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 25300\u201325308 (2025)","DOI":"10.1609\/aaai.v39i24.34717"},{"key":"605_CR38","doi-asserted-by":"crossref","unstructured":"Vaithilingam, P., Zhang, T., Glassman, E.L.: Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models. In: Chi Conference On Human Factors In Computing Systems Extended Abstracts, pp. 1\u20137 (2022)","DOI":"10.1145\/3491101.3519665"},{"key":"605_CR39","doi-asserted-by":"publisher","unstructured":"Wang, J., Chen, Y.: A review on code generation with llms: Application and evaluation. In: 2023 IEEE International Conference on Medical Artificial Intelligence (MedAI), pp. 284\u2013289 (2023). https:\/\/doi.org\/10.1109\/MedAI59581.2023.00044","DOI":"10.1109\/MedAI59581.2023.00044"},{"issue":"10","key":"605_CR40","doi-asserted-by":"publisher","first-page":"2683","DOI":"10.23940\/ijpe.19.10.p14.26832691","volume":"15","author":"W Wen","year":"2019","unstructured":"Wen, W., Xue, X., Li, Y.: Code similarity detection using ast and textual information. Intern. J. Performabil. Eng. 15(10), 2683 (2019)","journal-title":"Intern. J. Performabil. Eng."},{"key":"605_CR41","unstructured":"Yang, A., Li, A., Yang, B.: Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025)"},{"issue":"6","key":"605_CR42","doi-asserted-by":"publisher","first-page":"1091","DOI":"10.1109\/TPAMI.2007.1078","volume":"29","author":"L Yujian","year":"2007","unstructured":"Yujian, L., Bo, L.: A normalized levenshtein distance metric. IEEE Trans. Pattern Anal. Mach. Intell. 29(6), 1091\u20131095 (2007)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"605_CR43","doi-asserted-by":"crossref","unstructured":"Zeng, Z., Xie, Y., Zhang, H.: Meacap: Memory-augmented zero-shot image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14100\u201314110 (2024)","DOI":"10.1109\/CVPR52733.2024.01337"},{"key":"605_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, S., Pan, L., Zhao, J.: The knowledge alignment problem: Bridging human and external knowledge for large language models. arXiv preprint arXiv:2305.13669 (2023)","DOI":"10.18653\/v1\/2024.findings-acl.121"},{"key":"605_CR45","unstructured":"Zhang, M., Press, O., Merrill, W., et al.: How language model hallucinations can snowball. In: Forty-first International Conference on Machine Learning (2024). https:\/\/openreview.net\/forum?id=FPlaQyAGHu"}],"container-title":["Automated Software Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10515-026-00605-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10515-026-00605-0","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10515-026-00605-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T06:47:05Z","timestamp":1774421225000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10515-026-00605-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,25]]},"references-count":45,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,12]]}},"alternative-id":["605"],"URL":"https:\/\/doi.org\/10.1007\/s10515-026-00605-0","relation":{},"ISSN":["0928-8910","1573-7535"],"issn-type":[{"value":"0928-8910","type":"print"},{"value":"1573-7535","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,25]]},"assertion":[{"value":"19 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 February 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"70"}}