{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,23]],"date-time":"2026-06-23T08:49:34Z","timestamp":1782204574253,"version":"3.54.5"},"reference-count":119,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T00:00:00Z","timestamp":1776988800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T00:00:00Z","timestamp":1776988800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"NSERC Discovery Grant","award":["RGPIN-2025-04654"],"award-info":[{"award-number":["RGPIN-2025-04654"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Empir Software Eng"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1007\/s10664-026-10857-9","type":"journal-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T03:22:38Z","timestamp":1777000958000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["An empirical study of testing practices in open source AI agent frameworks and agentic applications"],"prefix":"10.1007","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9837-0998","authenticated-orcid":false,"given":"Mohammed 
Mehedi","family":"Hasan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4468-5972","authenticated-orcid":false,"given":"Hao","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5024-4868","authenticated-orcid":false,"given":"Emad","family":"Fallahzadeh","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1812-5365","authenticated-orcid":false,"given":"Gopi Krishnan","family":"Rajbahadur","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7213-4006","authenticated-orcid":false,"given":"Bram","family":"Adams","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7749-5513","authenticated-orcid":false,"given":"Ahmed E.","family":"Hassan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,4,24]]},"reference":[{"issue":"3","key":"10857_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10664-025-10631-3","volume":"30","author":"A Ajibode","year":"2025","unstructured":"Ajibode A, Bangash AA, Cogo FR, Adams B, Hassan AE (2025) Towards semantic versioning of open pre-trained language model releases on hugging face. Empir Softw Eng 30(3):1\u201363","journal-title":"Empir Softw Eng"},{"key":"10857_CR2","doi-asserted-by":"crossref","unstructured":"At\u0131l B, Aykent S, Chittams A, Fu L, Passonneau RJ, Radcliffe E, Rajagopal GR, Sloan A, Tudrej T, T\u00fcre F et\u00a0al (2025) Non-determinism of \u201cdeterministic\u201d llm system settings in hosted environments. 
In: Proceedings of the 5th workshop on evaluation and comparison of NLP systems, pp 135\u2013148","DOI":"10.18653\/v1\/2025.eval4nlp-1.12"},{"key":"10857_CR3","doi-asserted-by":"crossref","unstructured":"Bang F (2023) Gptcache: An open-source semantic cache for llm applications enabling faster answers and cost savings. In: Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023), pp 212\u2013218","DOI":"10.18653\/v1\/2023.nlposs-1.24"},{"issue":"4","key":"10857_CR4","doi-asserted-by":"publisher","first-page":"243","DOI":"10.1007\/s11334-014-0231-5","volume":"10","author":"AS Barb","year":"2014","unstructured":"Barb AS, Neill CJ, Sangwan RS, Piovoso MJ (2014) A statistical study of the relevance of lines of code measures in software projects. Innovations Syst Softw Eng 10(4):243\u2013260","journal-title":"Innovations Syst Softw Eng"},{"key":"10857_CR5","doi-asserted-by":"crossref","unstructured":"Barron RC, Grantcharov V, Wanna S, Eren ME, Bhattarai M, Solovyev N, Tompkins G, Nicholas C, Rasmussen K\u00d8, Matuszek C et al (2024) Domain-specific retrieval-augmented generation using vector stores, knowledge graphs, and tensor factorization. In: 2024 International Conference on Machine Learning and Applications (ICMLA). IEEE, pp 1669\u20131676","DOI":"10.1109\/ICMLA61862.2024.00258"},{"key":"10857_CR6","unstructured":"Bhargava A, Witkowski C, Detkov A, Thomson M (2024) Prompt baking. arXiv preprint arXiv:2409.13697"},{"key":"10857_CR7","doi-asserted-by":"crossref","unstructured":"Bitew SK, Deleu J, Develder C, Demeester T (2023) Distractor generation for multiple-choice questions with predictive prompting and large language models. In: Joint european conference on machine learning and knowledge discovery in databases. Springer, pp 48\u201363","DOI":"10.1007\/978-3-031-74627-7_4"},{"key":"10857_CR8","doi-asserted-by":"crossref","unstructured":"Bodea A (2022) Pytest-smell: a smell detection tool for python unit tests. 
In: Proceedings of the 31st ACM SIGSOFT international symposium on software testing and analysis, pp 793\u2013796","DOI":"10.1145\/3533767.3543290"},{"key":"10857_CR9","doi-asserted-by":"crossref","unstructured":"Boissier O, Bordini RH, Hubner J, Ricci A (2020) Multi-agent oriented programming: programming multi-agent systems using JaCaMo. Mit Press","DOI":"10.1017\/S026988891800005X"},{"key":"10857_CR10","doi-asserted-by":"crossref","unstructured":"Chandrasekaran J, Cody T, McCarthy N, Lanus E, Freeman L (2023) Test & evaluation best practices for machine learning-enabled systems. arXiv preprint arXiv:2310.06800","DOI":"10.2139\/ssrn.4862589"},{"issue":"2","key":"10857_CR11","first-page":"76","volume":"10","author":"K Chaokromthong","year":"2021","unstructured":"Chaokromthong K, Sintao N et al (2021) Sample size estimation using yamane and cochran and krejcie and morgan and green formulas and cohen statistical power analysis by g* power and comparisions. Apheit international journal of interdisciplinary social sciences and technology 10(2):76\u201386","journal-title":"Apheit international journal of interdisciplinary social sciences and technology"},{"key":"10857_CR12","unstructured":"Cheng Y, Zhang C, Zhang Z, Meng X, Hong S, Li W, Wang Z, Wang Z, Yin F, Zhao J et\u00a0al (2024) Exploring large language model based intelligent agents: Definitions, methods, and prospects. arXiv preprint arXiv:2401.03428"},{"key":"10857_CR13","unstructured":"Confident A (2024) Deepeval"},{"issue":"2","key":"10857_CR14","doi-asserted-by":"publisher","first-page":"397","DOI":"10.1108\/JD-06-2018-0091","volume":"75","author":"LY Conrad","year":"2019","unstructured":"Conrad LY, Tucker VM (2019) Making it tangible: hybrid card sorting within qualitative interviews. 
J Document 75(2):397\u2013416","journal-title":"J Document"},{"key":"10857_CR15","doi-asserted-by":"crossref","unstructured":"Cruciani E, Miranda B, Verdecchia R, Bertolino A (2019) Scalable approaches for test suite reduction. In: 2019 IEEE\/ACM 41st International Conference on Software Engineering (ICSE). IEEE, pp 419\u2013429","DOI":"10.1109\/ICSE.2019.00055"},{"key":"10857_CR16","doi-asserted-by":"crossref","unstructured":"Cui B, Li J, Guo T, Wang J, Ma D (2010) Code comparison system based on abstract syntax tree. In: 2010 3rd IEEE International Conference on Broadband Network and Multimedia Technology (IC-BNMT). IEEE, pp 668\u2013673","DOI":"10.1109\/ICBNMT.2010.5705174"},{"key":"10857_CR17","doi-asserted-by":"crossref","unstructured":"Daka E, Fraser G (2014) A survey on unit testing practices and problems. In: 2014 IEEE 25th international symposium on software reliability engineering. IEEE 2014:201\u2013211","DOI":"10.1109\/ISSRE.2014.11"},{"key":"10857_CR18","doi-asserted-by":"crossref","unstructured":"Dobslaw F, Feldt R, Yoon J, Yoo S (2025) Challenges in testing large language model based software: A faceted taxonomy. arXiv preprint arXiv:2503.00481","DOI":"10.1145\/3806396"},{"key":"10857_CR19","doi-asserted-by":"crossref","unstructured":"Es S, James J, Anke LE, Schockaert S (2024) Ragas: Automated evaluation of retrieval augmented generation. In: Proceedings of the 18th conference of the european chapter of the association for computational linguistics: system demonstrations, pp 150\u2013158","DOI":"10.18653\/v1\/2024.eacl-demo.16"},{"issue":"4","key":"10857_CR20","doi-asserted-by":"publisher","first-page":"e1845","DOI":"10.1002\/stvr.1845","volume":"33","author":"A Fontes","year":"2023","unstructured":"Fontes A, Gay G (2023) The integration of machine learning into automated test generation: A systematic mapping study. 
Softw Test Verif Reliab 33(4):e1845","journal-title":"Softw Test Verif Reliab"},{"key":"10857_CR21","doi-asserted-by":"crossref","unstructured":"Fujita S, Kashiwa Y, Lin B, Iida H (2023) An empirical study on the use of snapshot testing. In: 2023 IEEE International Conference on Software Maintenance and Evolution (ICSME). IEEE, pp 335\u2013340","DOI":"10.1109\/ICSME58846.2023.00041"},{"key":"10857_CR22","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1016\/j.infsof.2018.12.003","volume":"108","author":"V Garousi","year":"2019","unstructured":"Garousi V, Felderer M, K\u0131l\u0131\u00e7aslan FN (2019) A survey on software testability. Inf Softw Technol 108:35\u201364","journal-title":"Inf Softw Technol"},{"key":"10857_CR23","unstructured":"Gevers I, De\u00a0Marez V, Van\u00a0Nooten J, Lemmens J, Kosar A, Lotfi E, Banar N, Fivez P, De\u00a0Bruyne L, Daelemans W (2025) In benchmarks we trust... or not?. In: Proceedings of the 2025 conference on empirical methods in natural language processing, pp. 23\u00a0673\u201323\u00a0687"},{"key":"10857_CR24","doi-asserted-by":"crossref","unstructured":"Gonzalez D, Santos JC, Popovich A, Mirakhorli M, Nagappan M (2017) A large-scale study on the usage of testing patterns that address maintainability attributes: patterns for ease of modification, diagnoses, and comprehension. In: 2017 IEEE\/ACM 14th International Conference on Mining Software Repositories (MSR). IEEE, pp 391\u2013401","DOI":"10.1109\/MSR.2017.8"},{"key":"10857_CR25","doi-asserted-by":"crossref","unstructured":"Gonzalez D, Zimmermann T, Nagappan N (2020) The state of the ml-universe: 10 years of artificial intelligence & machine learning software development on github. In: Proceedings of the 17th International conference on mining software repositories, pp 431\u2013442","DOI":"10.1145\/3379597.3387473"},{"key":"10857_CR26","doi-asserted-by":"crossref","unstructured":"Gueron S, Johnson S, Walker J (2011) Sha-512\/256. 
In: 2011 8th international conference on information technology: new generations. IEEE, pp 354\u2013358","DOI":"10.1109\/ITNG.2011.69"},{"key":"10857_CR27","doi-asserted-by":"crossref","unstructured":"Han J, Deng S, Xia X, Wang D, Yin J (2019) Characterization and prediction of popular projects on github. In: 2019 IEEE 43rd annual computer software and applications conference (COMPSAC), vol 1. IEEE 2019:21\u201326","DOI":"10.1109\/COMPSAC.2019.00013"},{"key":"10857_CR28","doi-asserted-by":"crossref","unstructured":"H\u00e4ndler T (2023) A taxonomy for autonomous llm-powered multi-agent architectures. In: KMIS, pp 85\u201398","DOI":"10.5220\/0012239100003598"},{"key":"10857_CR29","doi-asserted-by":"crossref","unstructured":"Hasan MM, Li H, Fallahzadeh E, Rajbahadur GK, Adams B, Hassan AE (2025) Model context protocol (mcp) at first glance: Studying the security and maintainability of mcp servers. [Online]. Available: arXiv:2506.13538","DOI":"10.1145\/3814959"},{"key":"10857_CR30","doi-asserted-by":"crossref","unstructured":"Hassan MM, Rahman A (2022) As code testing: Characterizing test quality in open source ansible development. In: 2022 IEEE Conference on Software Testing, Verification and Validation (ICST). IEEE, pp 208\u2013219","DOI":"10.1109\/ICST53961.2022.00031"},{"key":"10857_CR31","doi-asserted-by":"crossref","unstructured":"Hassan AE, Lin D, Rajbahadur GK, Gallaba K, Cogo FR, Chen B, Zhang H, Thangarajah K, Oliva G, Lin J et\u00a0al (2024) Rethinking software engineering in the era of foundation models: A curated catalogue of challenges in the development of trustworthy fmware. In: Companion proceedings of the 32nd ACM international conference on the foundations of software engineering, pp 294\u2013305","DOI":"10.1145\/3663529.3663849"},{"key":"10857_CR32","doi-asserted-by":"crossref","unstructured":"Hettiarachchi I (2025) Exploring generative ai agents: Architecture, applications, and challenges. 
J Artif Intell General Sci (JAIGS) ISSN: 3006-4023 8(1):105\u2013127","DOI":"10.60087\/jaigs.v8i1.350"},{"key":"10857_CR33","unstructured":"Hexmoor H, Lammens J, Caicedo G, Shapiro SC (2026) Behaviour based AI, cognitive processes, and emergent behaviors in autonomous agents. WIT Press, vol 1"},{"key":"10857_CR34","doi-asserted-by":"crossref","unstructured":"Huang K, Hughes C (2025) Agentic ai communication protocols and security. In: Securing AI Agents: Foundations, Frameworks, and Real-World Deployment. Springer, pp 81\u2013110","DOI":"10.1007\/978-3-032-02130-4_4"},{"key":"10857_CR35","unstructured":"Huang D, Chew S, Dutkiewicz A, Wang Z (2025) Llm-as-a-judge for scalable test coverage evaluation: Accuracy, operational reliability, and cost. arXiv preprint arXiv:2512.01232"},{"key":"10857_CR36","unstructured":"Huyen C (2025a) Common pitfalls when building generative ai applications. [Online]. Available: https:\/\/huyenchip.com\/2025\/01\/16\/ai-engineering-pitfalls.html. Accessed 13 March 2026"},{"key":"10857_CR37","unstructured":"Huyen C (2025b) How to evaluate ai that\u2019s smarter than us. [Online]. Available: https:\/\/queue.acm.org\/detail.cfm?id=3722043. Accessed 06 Jan 2026"},{"key":"10857_CR38","volume-title":"Building Applications with Foundation Models","author":"C Huyen","year":"2024","unstructured":"Huyen C (2024) AI Engineering: Building Applications with Foundation Models. O\u2019Reilly Media, Incorporated"},{"key":"10857_CR39","doi-asserted-by":"crossref","unstructured":"Jebnoun H, Ben\u00a0Braiek H, Rahman MM, Khomh F (2020) The scent of deep learning code: An empirical study. In: Proceedings of the 17th international conference on mining software repositories, pp 420\u2013430","DOI":"10.1145\/3379597.3387479"},{"key":"10857_CR40","doi-asserted-by":"crossref","unstructured":"Jiang N-J, Marneffe M-C\u00a0d (2022) Investigating reasons for disagreement in natural language inference. 
Trans Assoc Comput Linguis 10:1357\u20131374","DOI":"10.1162\/tacl_a_00523"},{"key":"10857_CR41","doi-asserted-by":"crossref","unstructured":"Kalliamvakou E, Gousios G, Blincoe K, Singer L, German DM, Damian D (2014) The promises and perils of mining github. In: Proceedings of the 11th working conference on mining software repositories, pp 92\u2013101","DOI":"10.1145\/2597073.2597074"},{"key":"10857_CR42","doi-asserted-by":"crossref","unstructured":"Kampmann A, Zeller A (2019) Carving parameterized unit tests. In: 2019 IEEE\/ACM 41st international conference on software engineering: companion proceedings (ICSE-Companion). IEEE, pp 248\u2013249","DOI":"10.1109\/ICSE-Companion.2019.00098"},{"issue":"2","key":"10857_CR43","doi-asserted-by":"publisher","first-page":"152","DOI":"10.5395\/rde.2017.42.2.152","volume":"42","author":"H-Y Kim","year":"2017","unstructured":"Kim H-Y (2017) Statistical notes for clinical researchers: Chi-squared test and fisher\u2019s exact test. Restorative dentistry & endodontics 42(2):152","journal-title":"Restorative dentistry & endodontics"},{"issue":"6","key":"10857_CR44","doi-asserted-by":"publisher","first-page":"2971","DOI":"10.3390\/app15062971","volume":"15","author":"S Kim","year":"2025","unstructured":"Kim S, Oh D (2025) Evaluating creativity: can llms be good evaluators in creative writing tasks? Appl Sci 15(6):2971","journal-title":"Appl Sci"},{"key":"10857_CR45","unstructured":"Kokane S, Zhu M, Awalgaonkar TM, Zhang J, Prabhakar A, Hoang TQ, Liu Z, RN R, Yang L, Yao W et\u00a0al (2025) Toolscan: A benchmark for characterizing errors in tool-use llms. In: ICLR 2025 workshop on building trust in language models and applications"},{"issue":"1","key":"10857_CR46","doi-asserted-by":"publisher","DOI":"10.1016\/j.infoandorg.2025.100560","volume":"35","author":"S Krakowski","year":"2025","unstructured":"Krakowski S (2025) Human-ai agency in the age of generative ai. 
Inf Organ 35(1):100560","journal-title":"Inf Organ"},{"key":"10857_CR47","doi-asserted-by":"crossref","unstructured":"Labuschagne A, Inozemtseva L, Holmes R (2017) Measuring the cost of regression testing in practice: A study of java projects using continuous integration. In: Proceedings of the 2017 11th joint meeting on foundations of software engineering, pp 821\u2013830","DOI":"10.1145\/3106237.3106288"},{"key":"10857_CR48","unstructured":"Lam W, Srisakaokul S, Bassett B, Mahdian P, Xie T, Lakshman P, De Halleux J (2018) A characteristic study of parameterized unit tests in .NET open source projects. In: 32nd European Conference on Object-Oriented Programming (ECOOP 2018). Schloss Dagstuhl\u2013Leibniz-Zentrum f\u00fcr Informatik, pp 5\u20131"},{"key":"10857_CR49","doi-asserted-by":"crossref","unstructured":"Landis JR, Koch GG (1977) The measurement of observer agreement for categorical data. Biometrics, pp 159\u2013174","DOI":"10.2307\/2529310"},{"key":"10857_CR50","first-page":"9459","volume":"33","author":"P Lewis","year":"2020","unstructured":"Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, K\u00fcttler H, Lewis M, Yih W-T, Rockt\u00e4schel T et al (2020) Retrieval-augmented generation for knowledge-intensive nlp tasks. Adv Neural Inf Process Syst 33:9459\u20139474","journal-title":"Adv Neural Inf Process Syst"},{"issue":"1","key":"10857_CR51","doi-asserted-by":"publisher","first-page":"6","DOI":"10.1007\/s10664-024-10570-5","volume":"30","author":"H Li","year":"2025","unstructured":"Li H, Bezemer C-P (2025) Bridging the language gap: an empirical study of bindings for open source machine learning libraries across software package ecosystems. Empir Softw Eng 30(1):6","journal-title":"Empir Softw Eng"},{"key":"10857_CR52","doi-asserted-by":"crossref","unstructured":"Li G, Hammoud H, Itani H, Khizbullin D, Ghanem B (2023) Camel: Communicative agents for \u201cmind\u201d exploration of large language model society. 
Adv Neural Inf Process Syst 36:51\u00a0991\u201352\u00a0008","DOI":"10.52202\/075280-2264"},{"issue":"1","key":"10857_CR53","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1007\/s44336-024-00009-2","volume":"1","author":"X Li","year":"2024","unstructured":"Li X, Wang S, Zeng S, Wu Y, Yang Y (2024) A survey on llm-based multi-agent systems: workflow, infrastructure, and challenges. Vicinagearth 1(1):9","journal-title":"Vicinagearth"},{"key":"10857_CR54","doi-asserted-by":"crossref","unstructured":"Liu Y, Iter D, Xu Y, Wang S, Xu R, Zhu C (2023a) G-eval: Nlg evaluation using gpt-4 with better human alignment. In: Proceedings of the 2023 conference on empirical methods in natural language processing, pp 2511\u20132522","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"10857_CR55","unstructured":"Liu X, Yu H, Zhang H, Xu Y, Lei X, Lai H, Gu Y, Ding H, Men K, Yang K et\u00a0al (2023b) Agentbench: Evaluating llms as agents. arXiv preprint arXiv:2308.03688"},{"key":"10857_CR56","doi-asserted-by":"crossref","unstructured":"Liu Y, Lo SK, Lu Q, Zhu L, Zhao D, Xu X, Harrer S, Whittle J (2025) Agent design pattern catalogue: A collection of architectural patterns for foundation model based agents. J Syst Softw 220:112278","DOI":"10.1016\/j.jss.2024.112278"},{"issue":"5","key":"10857_CR57","doi-asserted-by":"publisher","first-page":"1077","DOI":"10.1007\/s10270-024-01207-8","volume":"23","author":"R Lukyanenko","year":"2024","unstructured":"Lukyanenko R, Samuel BM, Parsons J, Storey VC, Pastor O, Jabbari A (2024) Universal conceptual modeling: principles, benefits, and an agenda for conceptual modeling research. Softw Syst Model 23(5):1077\u20131100","journal-title":"Softw Syst Model"},{"key":"10857_CR58","unstructured":"Marchal M, Scholman M, Yung F, Demberg V (2022) Establishing annotation quality in multi-label annotations. 
In: Proceedings of the 29th international conference on computational linguistics, pp 3659\u20133668"},{"key":"10857_CR59","doi-asserted-by":"crossref","unstructured":"Marvin G, Hellen N, Jjingo D, Nakatumba-Nabende J (2023) Prompt engineering in large language models. In: International conference on data intelligence and cognitive informatics. Springer, pp 387\u2013402","DOI":"10.1007\/978-981-99-7962-2_30"},{"key":"10857_CR60","unstructured":"Masterman T, Besen S, Sawtell M, Chao A (2024) The landscape of emerging ai agent architectures for reasoning, planning, and tool calling: A survey. arXiv preprint arXiv:2404.11584"},{"key":"10857_CR61","doi-asserted-by":"crossref","unstructured":"Ma W, Yang C, K\u00e4stner C (2024) (why) is my prompt getting worse? rethinking regression testing for evolving llm apis. In: Proceedings of the IEEE\/ACM 3rd international conference on ai engineering-software engineering for AI, pp 166\u2013171","DOI":"10.1145\/3644815.3644950"},{"key":"10857_CR62","unstructured":"Ma W, Yang Y, Hu Q, Ying S, Jin Z, Du B, Xing Z, Li T, Shi J, Liu Y et\u00a0al (2025) Rethinking testing for llm applications: Characteristics, challenges, and a lightweight interaction protocol. arXiv preprint arXiv:2508.20737"},{"key":"10857_CR63","unstructured":"Meszaros G (2007) xUnit test patterns: Refactoring test code. Pearson Education"},{"key":"10857_CR64","doi-asserted-by":"crossref","unstructured":"Meszaros G, Smith SM, Andrea J (2003) The test automation manifesto. In: Conference on extreme programming and agile methods. Springer, pp 73\u201381","DOI":"10.1007\/978-3-540-45122-8_9"},{"key":"10857_CR65","unstructured":"Mialon G, Fourrier C, Wolf T, LeCun Y, Scialom T (2023) Gaia: a benchmark for general ai assistants. In: The 12th international conference on learning representations"},{"key":"10857_CR66","doi-asserted-by":"crossref","unstructured":"Mohammadi M, Li Y, Lo J, Yip W (2025) Evaluation and benchmarking of llm agents: A survey. 
In: Proceedings of the 31st ACM SIGKDD conference on knowledge discovery and data mining V 2:6129\u20136139","DOI":"10.1145\/3711896.3736570"},{"key":"10857_CR67","doi-asserted-by":"publisher","first-page":"3219","DOI":"10.1007\/s10664-017-9512-6","volume":"22","author":"N Munaiah","year":"2017","unstructured":"Munaiah N, Kroh S, Cabrey C, Nagappan M (2017) Curating github for engineered software projects. Empir Softw Eng 22:3219\u20133253","journal-title":"Empir Softw Eng"},{"key":"10857_CR68","unstructured":"Myers GJ (2006) The art of software testing. John Wiley & Sons"},{"key":"10857_CR69","doi-asserted-by":"crossref","unstructured":"Nascimento N, Alencar P, Cowan D (2023) Self-adaptive large language model (llm)-based multiagent systems. In: 2023 IEEE International Conference on Autonomic Computing and Self-Organizing Systems Companion (ACSOSC). IEEE, pp 104\u2013109","DOI":"10.1109\/ACSOS-C58168.2023.00048"},{"key":"10857_CR70","doi-asserted-by":"crossref","unstructured":"Nayebi M, Kuznetsov K, Chen P, Zeller A, Ruhe G (2018) Anatomy of functionality deletion: an exploratory study on mobile apps. In: Proceedings of the 15th international conference on mining software repositories, pp 243\u2013253","DOI":"10.1145\/3196398.3196410"},{"key":"10857_CR71","doi-asserted-by":"crossref","unstructured":"Nejadgholi M, Yang J (2019) A study of oracle approximations in testing deep learning libraries. In: 2019 34th IEEE\/ACM International Conference on Automated Software Engineering (ASE). IEEE, pp 785\u2013796","DOI":"10.1109\/ASE.2019.00078"},{"key":"10857_CR72","doi-asserted-by":"crossref","unstructured":"Niedermayr R, Juergens E, Wagner S (2016) Will my tests tell me if i break this code?. 
In: Proceedings of the international workshop on continuous software evolution and delivery, pp 23\u201329","DOI":"10.1145\/2896941.2896944"},{"key":"10857_CR73","doi-asserted-by":"crossref","unstructured":"Nishi Y, Masuda S, Ogawa H, Uetsuki K (2018) A test architecture for machine learning product. In: 2018 IEEE International Conference on Software Testing, Verification and Validation Workshops (ICSTW). IEEE, pp 273\u2013278","DOI":"10.1109\/ICSTW.2018.00060"},{"key":"10857_CR74","unstructured":"Okken B (2022) Python Testing with pytest. Pragmatic Bookshelf"},{"key":"10857_CR75","doi-asserted-by":"crossref","unstructured":"Openja M, Khomh F, Foundjem A, Jiang ZM, Abidi M, Hassan AE (2024) An empirical study of testing machine learning in the wild. ACM Trans Softw Eng Methodol","DOI":"10.1145\/3680463"},{"key":"10857_CR76","unstructured":"Pan MZ, Arabzadeh N, Cogo R, Zhu Y, Xiong A, Agrawal LA, Mao H, Shen E, Pallerla S, Patel L et\u00a0al (2025) Measuring agents in production. arXiv preprint arXiv:2512.04123"},{"key":"10857_CR77","doi-asserted-by":"crossref","unstructured":"Park JS, O\u2019Brien J, Cai CJ, Morris MR, Liang P, Bernstein MS (2023) Generative agents: Interactive simulacra of human behavior. In: Proceedings of the 36th annual acm symposium on user interface software and technology, pp 1\u201322","DOI":"10.1145\/3586183.3606763"},{"key":"10857_CR78","unstructured":"Parker MJ, Anderson C, Stone C, Oh Y (2024) A large language model approach to educational survey feedback analysis. Int J Artif Intell Educ, pp 1\u201338"},{"key":"10857_CR79","doi-asserted-by":"crossref","unstructured":"Passonneau R (2006) Measuring agreement on set-valued items (MASI) for semantic and pragmatic annotation. In: Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC\u201906). Genoa, Italy: European Language Resources Association (ELRA). [Online]. 
Available: https:\/\/aclanthology.org\/L06-1392\/","DOI":"10.63317\/4nuo6thi27ax"},{"key":"10857_CR80","doi-asserted-by":"crossref","unstructured":"Patil R, Boit S, Gudivada V, Nandigam J (2023) A survey of text representation and embedding techniques in nlp. IEEE Access, 11:36 120\u201336 146","DOI":"10.1109\/ACCESS.2023.3266377"},{"key":"10857_CR81","doi-asserted-by":"crossref","unstructured":"Pei K, Cao Y, Yang J, Jana S (2017) Deepxplore: Automated whitebox testing of deep learning systems. In: proceedings of the 26th symposium on operating systems principles, pp 1\u201318","DOI":"10.1145\/3132747.3132785"},{"key":"10857_CR82","unstructured":"Rafi MN, Kim DJ, Chen T-H, Wang S (2026) Order matters! an empirical study on large language models\u2019 input order bias in software fault localization. In: Proceedings of the 48th IEEE\/ACM International Conference on Software Engineering (ICSE), to appear"},{"key":"10857_CR83","unstructured":"Rajbahadur GK, Oliva GA, Lin D, Hassan AE (2024) From cool demos to production-ready fmware: Core challenges and a technology roadmap. arXiv preprint arXiv:2410.20791"},{"key":"10857_CR84","unstructured":"Rao AS, Georgeff MP et\u00a0al (1995) Bdi agents: from theory to practice. In: Icmas 95:312\u2013319"},{"key":"10857_CR85","doi-asserted-by":"crossref","unstructured":"Razavi A, Soltangheis M, Arabzadeh N, Salamat S, Zihayat M, Bagheri E (2025) Benchmarking prompt sensitivity in large language models. In: European conference on information retrieval. Springer, pp 303\u2013 313","DOI":"10.1007\/978-3-031-88714-7_29"},{"key":"10857_CR86","first-page":"1831","volume":"34","author":"R Sahoo","year":"2021","unstructured":"Sahoo R, Zhao S, Chen A, Ermon S (2021) Reliable decisions with threshold calibration. 
Adv Neural Inf Process Syst 34:1831\u20131844","journal-title":"Adv Neural Inf Process Syst"},{"key":"10857_CR87","doi-asserted-by":"crossref","unstructured":"Schmidt CW, Reddy V, Zhang H, Alameddine A, Uzan O, Pinter Y, Tanner C (2024) Tokenization is more than compression. In: Proceedings of the 2024 conference on empirical methods in natural language processing, pp 678\u2013702","DOI":"10.18653\/v1\/2024.emnlp-main.40"},{"key":"10857_CR88","doi-asserted-by":"crossref","unstructured":"Shen Y, Song K, Tan X, Li D, Lu W, Zhuang Y (2024) Hugginggpt: Solving ai tasks with chatgpt and its friends in hugging face. Adv Neural Inf Process Syst, vol 36","DOI":"10.52202\/075280-1657"},{"key":"10857_CR89","unstructured":"Spencer D (2009) Card sorting: Designing usable categories. Rosenfeld Media"},{"key":"10857_CR90","doi-asserted-by":"crossref","unstructured":"Spirin E, Bogomolov E, Kovalenko V, Bryksin T (2021) Psiminer: A tool for mining rich abstract syntax trees from code. In: 2021 IEEE\/ACM 18th International Conference on Mining Software Repositories (MSR). IEEE, pp 13\u201317","DOI":"10.1109\/MSR52588.2021.00014"},{"key":"10857_CR91","doi-asserted-by":"crossref","unstructured":"Tao Y (2009) An introduction to assertion-based verification. In: 2009 IEEE 8th international conference on ASIC. IEEE, pp 1318\u20131323","DOI":"10.1109\/ASICON.2009.5351246"},{"issue":"1","key":"10857_CR92","doi-asserted-by":"publisher","first-page":"77","DOI":"10.3102\/10769986027001077","volume":"27","author":"D Thissen","year":"2002","unstructured":"Thissen D, Steinberg L, Kuang D (2002) Quick and easy implementation of the benjamini-hochberg procedure for controlling the false positive rate in multiple comparisons. 
J Educ Behav Stat 27(1):77\u201383","journal-title":"J Educ Behav Stat"},{"issue":"5","key":"10857_CR93","doi-asserted-by":"publisher","first-page":"253","DOI":"10.1145\/1095430.1081749","volume":"30","author":"N Tillmann","year":"2005","unstructured":"Tillmann N, Schulte W (2005) Parameterized unit tests. ACM SIGSOFT Softw Eng Notes 30(5):253\u2013262","journal-title":"ACM SIGSOFT Softw Eng Notes"},{"key":"10857_CR94","doi-asserted-by":"crossref","unstructured":"Tramer F, Atlidakis V, Geambasu R, Hsu D, Hubaux J-P, Humbert M, Juels A, Lin H (2017) Fairtest: Discovering unwarranted associations in data-driven applications. In: IEEE european symposium on security and privacy (EuroS&P). IEEE 2017:401\u2013416","DOI":"10.1109\/EuroSP.2017.29"},{"key":"10857_CR95","doi-asserted-by":"crossref","unstructured":"Van Rompaey B, Demeyer S (2008) Exploring the composition of unit test suites. In: 2008 23rd IEEE\/ACM international conference on automated software engineering-workshops. IEEE, pp 11\u201320","DOI":"10.1109\/ASEW.2008.4686316"},{"issue":"9","key":"10857_CR96","first-page":"1857","volume":"47","author":"Z Wan","year":"2019","unstructured":"Wan Z, Xia X, Lo D, Murphy GC (2019) How does machine learning change software development practices? IEEE Trans Software Eng 47(9):1857\u20131871","journal-title":"IEEE Trans Software Eng"},{"key":"10857_CR97","unstructured":"Wang X, Wang Z, Liu J, Chen Y, Yuan L, Peng H, Ji H (2023) Mint: Evaluating llms in multi-turn interaction with tools and language feedback. arXiv preprint arXiv:2309.10691"},{"issue":"4","key":"10857_CR98","doi-asserted-by":"publisher","first-page":"911","DOI":"10.1109\/TSE.2024.3368208","volume":"50","author":"J Wang","year":"2024","unstructured":"Wang J, Huang Y, Chen C, Liu Z, Wang S, Wang Q (2024) Software testing with large language models: Survey, landscape, and vision. 
IEEE Trans Software Eng 50(4):911\u2013936","journal-title":"IEEE Trans Software Eng"},{"key":"10857_CR99","doi-asserted-by":"crossref","unstructured":"Wei J, Wang X, Schuurmans D, Bosma M, Xia F, Chi E, Le QV, Zhou D et\u00a0al (2022a) Chain-of-thought prompting elicits reasoning in large language models. Adv Neural Inf Process Syst 35:24\u00a0824\u201324\u00a0837","DOI":"10.52202\/068431-1800"},{"key":"10857_CR100","doi-asserted-by":"crossref","unstructured":"Wei C, Xiao L, Yu T, Chen X, Wang X, Wong S, Clune A (2022b) Automatically tagging the \u201caaa\u201d pattern in unit test cases using machine learning models. In: Proceedings of the 37th IEEE\/ACM international conference on automated software engineering, pp 1\u20133","DOI":"10.1145\/3551349.3559510"},{"key":"10857_CR101","doi-asserted-by":"crossref","unstructured":"Wei C, Xiao L, Yu T, Wong S, Clune A (2025) How do developers structure unit test cases? an empirical analysis of the aaa pattern in open source projects. IEEE Trans Softw Eng","DOI":"10.1109\/TSE.2025.3537337"},{"key":"10857_CR102","unstructured":"Whyte G, Mulder DL (2011) Mitigating the impact of software test constraints on software testing effectiveness. Electr J Inf Syst Evaluat 14(2):254\u2013270"},{"key":"10857_CR103","unstructured":"Wu Q, Bansal G, Zhang J, Wu Y, Li B, Zhu E, Jiang L, Zhang X, Zhang S, Liu J et\u00a0al (2024a) Autogen: Enabling next-gen llm applications via multi-agent conversations. In: 1st Conference on Language Modeling"},{"key":"10857_CR104","doi-asserted-by":"crossref","unstructured":"Wu Z, Lin X, Dai Z, Hu W, Shu Y, Ng S-K, Jaillet P, Low BKH (2024b) Prompt optimization with ease? efficient ordering-aware automated selection of exemplars. Adv Neural Inf Process Syst, 37:122\u00a0706\u2013122\u00a0740","DOI":"10.52202\/079017-3899"},{"key":"10857_CR105","doi-asserted-by":"crossref","unstructured":"Xu FF, Alon U, Neubig G, Hellendoorn VJ (2022) A systematic evaluation of large language models of code. 
In: Proceedings of the 6th ACM SIGPLAN international symposium on machine programming, pp 1\u201310","DOI":"10.1145\/3520312.3534862"},{"issue":"2","key":"10857_CR106","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1002\/stv.430","volume":"22","author":"S Yoo","year":"2012","unstructured":"Yoo S, Harman M (2012) Regression testing minimization, selection and prioritization: a survey. Softw Test Verif Reliab 22(2):67\u2013120","journal-title":"Softw Test Verif Reliab"},{"key":"10857_CR107","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2022.111425","volume":"192","author":"F Zampetti","year":"2022","unstructured":"Zampetti F, Kapur R, Di Penta M, Panichella S (2022) An empirical characterization of software bugs in open-source cyber-physical systems. J Syst Softw 192:111425","journal-title":"J Syst Softw"},{"issue":"4","key":"10857_CR108","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1109\/TSE.2022.3217544","volume":"49","author":"L Zamprogno","year":"2022","unstructured":"Zamprogno L, Hall B, Holmes R, Atlee JM (2022) Dynamic human-in-the-loop assertion generation. IEEE Trans Software Eng 49(4):2337\u20132351","journal-title":"IEEE Trans Software Eng"},{"key":"10857_CR109","doi-asserted-by":"crossref","unstructured":"Zhang Y, Mesbah A (2015) Assertions are strongly correlated with test suite effectiveness. In: Proceedings of the 2015 10th joint meeting on foundations of software engineering, pp 214\u2013224","DOI":"10.1145\/2786805.2786858"},{"issue":"1","key":"10857_CR110","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TSE.2019.2962027","volume":"48","author":"JM Zhang","year":"2020","unstructured":"Zhang JM, Harman M, Ma L, Liu Y (2020) Machine learning testing: Survey, landscapes and horizons. 
IEEE Trans Software Eng 48(1):1\u201336","journal-title":"IEEE Trans Software Eng"},{"key":"10857_CR111","doi-asserted-by":"crossref","unstructured":"Zhang Z, Dai Q, Bo X, Ma C, Li R, Chen X, Zhu J, Dong Z, Wen J-R (2025a) A survey on the memory mechanism of large language model-based agents. ACM Trans Inf Syst 43(6):1\u201347","DOI":"10.1145\/3748302"},{"key":"10857_CR112","unstructured":"Zhang Q, Wornow M, Olukotun K (2025b) Cost-efficient serving of llm agents via test-time plan caching. In: ES-FoMo III: 3rd workshop on efficient systems for foundation models"},{"issue":"4","key":"10857_CR113","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/s10664-024-10474-4","volume":"29","author":"Z Zhao","year":"2024","unstructured":"Zhao Z, Chen Y, Bangash AA, Adams B, Hassan AE (2024) An empirical study of challenges in machine learning asset management. Empir Softw Eng 29(4):98","journal-title":"Empir Softw Eng"},{"key":"10857_CR114","doi-asserted-by":"crossref","unstructured":"Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E et\u00a0al (2023) Judging llm-as-a-judge with mt-bench and chatbot arena. Adv Neural Inf Process Syst, 36:46\u00a0595\u201346\u00a0623","DOI":"10.52202\/075280-2020"},{"key":"10857_CR115","unstructured":"Zhou S, Xu FF, Zhu H, Zhou X, Lo R, Sridhar A, Cheng X, Ou T, Bisk Y, Fried D et\u00a0al (2023)Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854"},{"key":"10857_CR116","doi-asserted-by":"crossref","unstructured":"Zhu H, Wei L, Terragni V, Liu Y, Cheung S-C, Wu J, Sheng Q, Zhang B, Song L (2023) Stubcoder: Automated generation and repair of stub code for mock objects. ACM Trans Softw Eng Methodol 33(1):1\u201331","DOI":"10.1145\/3617171"},{"key":"10857_CR117","doi-asserted-by":"crossref","unstructured":"Zhu H, Terragni V, Wei L, Cheung S-C, Wu J, Liu Y (2025) Understanding and characterizing mock assertions in unit tests. 
In: Proceedings of the ACM on software engineering, vol 2, no FSE, pp 554\u2013575","DOI":"10.1145\/3715741"},{"key":"10857_CR118","doi-asserted-by":"crossref","unstructured":"Zhuo J, Zhang S, Fang X, Duan H, Lin D, Chen K (2024) Prosa: Assessing and understanding the prompt sensitivity of llms. Findings of the Association for Computational Linguistics EMNLP 2024:1950\u20131976","DOI":"10.18653\/v1\/2024.findings-emnlp.108"},{"key":"10857_CR119","doi-asserted-by":"crossref","unstructured":"Zou W, Zhang W, Xia X, Holmes R, Chen Z (2019) Branch use in practice: A large-scale empirical study of 2,923 projects on github. In: 2019 ieee 19th international conference on software quality, reliability and security (qrs). IEEE 2019:306\u2013317","DOI":"10.1109\/QRS.2019.00047"}],"container-title":["Empirical Software Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10664-026-10857-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10664-026-10857-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10664-026-10857-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,23]],"date-time":"2026-06-23T08:15:28Z","timestamp":1782202528000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10664-026-10857-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,24]]},"references-count":119,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,9]]}},"alternative-id":["10857"],"URL":"https:\/\/doi.org\/10.1007\/s10664-026-10857-9","relation":{},"ISSN":["1382-3256","1573-7616"],"issn-type":[{"value":"1382-3256","type":"print"},{"value":"1573-7616","type":"electronic"}]
,"subject":[],"published":{"date-parts":[[2026,4,24]]},"assertion":[{"value":"24 September 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 March 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 April 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declared that they have no known competing interests or personal relationships that could have (appeared to) influenced the work reported in this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interest"}},{"value":"This study does not involve human participants or animals.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}},{"value":"Not applicable. No human subjects were involved in this study.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Informed Consent"}},{"value":"Not applicable.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Clinical Trial Number in the Manuscript"}}],"article-number":"124"}}