{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T05:55:48Z","timestamp":1778046948711,"version":"3.51.4"},"reference-count":57,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Luxembourg Institute of Science and Technology (LIST) through the Projects \u201cADIALab-MAST\u201d and \u201cLLMs4E;\u201d"},{"name":"LLMs4EU Project is co-funded by the European Union through the Digital Europe Program","award":["101198470"],"award-info":[{"award-number":["101198470"]}]},{"name":"Barcelona Supercomputing Center through the Project \u201cTIFON;\u201d"},{"name":"Centre for the Development of Industrial Technology (CDTI) with the Support of the Spanish Ministry of Science and Innovation","award":["MIG-20232039"],"award-info":[{"award-number":["MIG-20232039"]}]},{"DOI":"10.13039\/501100011033","name":"Agencia Estatal de Investigaci\u00f3n","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100011033","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/access.2025.3646270","type":"journal-article","created":{"date-parts":[[2025,12,19]],"date-time":"2025-12-19T18:59:47Z","timestamp":1766170787000},"page":"214772-214791","source":"Crossref","is-referenced-by-count":2,"title":["Metamorphic Testing for Semantic Invariance in Large Language Models"],"prefix":"10.1109","volume":"13","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8334-4719","authenticated-orcid":false,"given":"J.","family":"De Curt\u00f2","sequence":"first","affiliation":[{"name":"Department of Computer Applications in Science and Engineering, Barcelona Supercomputing Center, Barcelona, Spain"}]},{"given":"I.","family":"De Zarz\u00e0","sequence":"additional","affiliation":[{"name":"Human Centered AI, Data and Software, Luxembourg Institute of Science and Technology, Esch-sur-Alzette, Luxembourg"}]}],"member":"263","reference":[{"key":"ref1","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref2","volume-title":"Claude 3 Model Card","year":"2024"},{"key":"ref3","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2025.3548451"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.lindif.2023.102274"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-023-06924-6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1038\/s41591-023-02448-8"},{"key":"ref8","article-title":"On the robustness of ChatGPT: An adversarial and out-of-distribution perspective","author":"Wang","year":"2023","journal-title":"arXiv:2302.12095"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3689217.3690621"},{"key":"ref10","article-title":"Large language models can be easily distracted by irrelevant context","author":"Shi","year":"2023","journal-title":"arXiv:2302.00093"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3143561"},{"key":"ref12","volume-title":"Hermes-4-70b","year":"2024"},{"key":"ref13","volume-title":"DeepSeek-R1: Reasoning-Optimized Large Language Model","author":"AI","year":"2024"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"ref15","article-title":"SuperGLUE: A stickier benchmark for general-purpose language understanding systems","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref16","article-title":"Training verifiers to solve math word problems","author":"Cobbe","year":"2021","journal-title":"arXiv:2110.14168"},{"key":"ref17","article-title":"Measuring mathematical problem solving with the MATH dataset","author":"Hendrycks","year":"2021","journal-title":"arXiv:2103.03874"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.411"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-4413"},{"key":"ref20","article-title":"Think you have solved question answering? Try ARC, the AI2 reasoning challenge","author":"Clark","year":"2018","journal-title":"arXiv:1803.05457"},{"key":"ref21","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu"},{"key":"ref22","article-title":"Measuring massive multitask language understanding","author":"Hendrycks","year":"2020","journal-title":"arXiv:2009.03300"},{"key":"ref23","article-title":"Beyond the imitation game: Quantifying and extrapolating the capabilities of language models","volume":"5","author":"Srivastava","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref24","article-title":"Holistic evaluation of language models","author":"Liang","year":"2022","journal-title":"arXiv:2211.09110"},{"key":"ref25","article-title":"Fantastically ordered prompts and where to find them: Overcoming few-shot prompt order sensitivity","author":"Lu","year":"2021","journal-title":"arXiv:2104.08786"},{"key":"ref26","first-page":"12697","article-title":"Calibrate before use: Improving few-shot performance of language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhao"},{"key":"ref27","article-title":"Do prompt-based models really understand the meaning of their prompts?","author":"Webson","year":"2021","journal-title":"arXiv:2109.01247"},{"key":"ref28","article-title":"Measuring normative and descriptive biases in language models using census data","author":"Han","year":"2023","journal-title":"arXiv:2304.05764"},{"key":"ref29","article-title":"Universal and transferable adversarial attacks on aligned language models","author":"Zou","year":"2023","journal-title":"arXiv:2307.15043"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.806"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00410"},{"key":"ref32","first-page":"88","article-title":"On the systematicity of probing contextualized word representations: The case of hypernymy in BERT","volume-title":"Proc. 9th Joint Conf. Lexical Comput. Semantics.","author":"Ravichander","year":"2020"},{"key":"ref33","article-title":"Self-consistency improves chain of thought reasoning in language models","author":"Wang","year":"2022","journal-title":"arXiv:2203.11171"},{"key":"ref34","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wei"},{"key":"ref35","article-title":"Measuring faithfulness in chain-of-thought reasoning","author":"Lanham","year":"2023","journal-title":"arXiv:2307.13702"},{"key":"ref36","article-title":"Language models don\u2019t always say what they think: Unfaithful explanations in chain-of-thought prompting","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Turpin"},{"key":"ref37","volume-title":"Metamorphic Testing: A New Approach for Generating Next Test Cases","author":"Chen","year":"2020"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2018.2880577"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2010.11.920"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3213846.3213858"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3180155.3180220"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3238147.3238187"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.442"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/64"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3468264.3468569"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref47","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","volume-title":"Proc. Text Summarization Branches Out","author":"Lin"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d19-1410"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.552"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d15-1075"},{"key":"ref51","article-title":"A broad-coverage challenge corpus for sentence understanding through inference","author":"Williams","year":"2017","journal-title":"arXiv:1704.05426"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.393"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177730491.MR0022058.Zbl0041.26103"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1080\/01621459.1952.10483441"},{"key":"ref55","first-page":"8","volume-title":"Statistical Power Analysis for the Behavioral Science","author":"Cohen","year":"1988"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/FLLM63129.2024.10852449"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1038\/s41592-019-0686-2"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/10820123\/11305018.pdf?arnumber=11305018","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,5]],"date-time":"2026-01-05T18:41:22Z","timestamp":1767638482000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11305018\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":57,"URL":"https:\/\/doi.org\/10.1109\/access.2025.3646270","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}