{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,30]],"date-time":"2026-05-30T17:01:42Z","timestamp":1780160502403,"version":"3.54.0"},"reference-count":133,"publisher":"Elsevier BV","issue":"8","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T00:00:00Z","timestamp":1779408000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100010198","name":"Ministerio de Asuntos Econ\u00f3micos y Transformaci\u00f3n Digital, Gobierno de Espa\u00f1a","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100010198","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003359","name":"Generalitat of Valencia","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003359","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100008530","name":"European Regional Development Fund","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100008530","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100014440","name":"Spain Ministry of Science Innovation and Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100014440","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Processing &amp; Management"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.ipm.2026.104922","type":"journal-article","created":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T11:29:13Z","timestamp":1779449353000},"page":"104922","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-sectional analysis of large language models in current natural language generation challenges"],"prefix":"10.1016","volume":"63","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7996-4440","authenticated-orcid":false,"given":"Mar\u00eda Mir\u00f3","family":"Maestre","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5684-0083","authenticated-orcid":false,"given":"Iv\u00e1n","family":"Mart\u00ednez-Murillo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Aitana Morote","family":"Mart\u00ednez","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2926-294X","authenticated-orcid":false,"given":"Elena","family":"Lloret","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.ipm.2026.104922_b1","doi-asserted-by":"crossref","unstructured":"Abjalova, M., & Sharipova, S (2024). Semantic and grammatical issues in translating idioms with automatic translation systems. In 2024 9th international conference on computer science and engineering (pp. 58\u201363).","DOI":"10.1109\/UBMK63289.2024.10773608"},{"key":"10.1016\/j.ipm.2026.104922_b2","series-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"10.1016\/j.ipm.2026.104922_b3","doi-asserted-by":"crossref","first-page":"1173","DOI":"10.1109\/TASLP.2024.3353574","article-title":"TrICy: Trigger-guided data-to-text generation with intent aware attention-copy","volume":"32","author":"Agarwal","year":"2024","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.ipm.2026.104922_b4","series-title":"Findings of the Association for Computational Linguistics: EACL 2024","first-page":"912","article-title":"Do language models know when they\u2019re hallucinating references?","author":"Agrawal","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b5","series-title":"The falcon series of open language models","author":"Almazrouei","year":"2023"},{"key":"10.1016\/j.ipm.2026.104922_b6","series-title":"The carbon cost of conversation, sustainability in the age of language models","author":"Amiri","year":"2025"},{"key":"10.1016\/j.ipm.2026.104922_b7","doi-asserted-by":"crossref","DOI":"10.1093\/pnasnexus\/pgaf089","article-title":"Measuring gender and racial biases in large language models: Intersectional evidence from automated resume evaluation","volume":"4","author":"An","year":"2025","journal-title":"PNAS Nexus"},{"issue":"3","key":"10.1016\/j.ipm.2026.104922_b8","doi-asserted-by":"crossref","first-page":"2967","DOI":"10.1007\/s10115-024-02310-4","article-title":"Large language models: a survey of their development, capabilities, and applications","volume":"67","author":"Annepaka","year":"2025","journal-title":"Knowledge and Information Systems"},{"key":"10.1016\/j.ipm.2026.104922_b9","series-title":"Findings of the Association for Computational Linguistics: ACL 2024","first-page":"12670","article-title":"Strong hallucinations from negation and how to fix them","author":"Asher","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b10","article-title":"Evaluating large language models through the lens of linguistic proficiency and world knowledge: A comparative study","author":"Atox","year":"2024","journal-title":"Authorea Preprints"},{"key":"10.1016\/j.ipm.2026.104922_b11","doi-asserted-by":"crossref","unstructured":"Azhar, U., & Nazir, A (2024). Exploring the natural language generation: Current trends and research challenges. In 2024 international conference on engineering & computing technologies (pp. 1\u20136).","DOI":"10.1109\/ICECT61618.2024.10581073"},{"key":"10.1016\/j.ipm.2026.104922_b12","doi-asserted-by":"crossref","unstructured":"Baziotis, C., Mathur, P., & Hasler, E (2023). Automatic evaluation and analysis of idioms in neural machine translation. In Proceedings of the 17th conference of the European chapter of the association for computational linguistics (pp. 3682\u20133700).","DOI":"10.18653\/v1\/2023.eacl-main.267"},{"key":"10.1016\/j.ipm.2026.104922_b13","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2024","first-page":"15622","article-title":"To ask LLMs about english grammaticality, prompt them in a different language","author":"Behzad","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b14","unstructured":"Bestgen, Y. Please, don\u2019t forget the difference and the confidence interval when seeking for the state-of-the-art status. In Proceedings of the Thirteenth Language Resources and Evaluation Conference."},{"key":"10.1016\/j.ipm.2026.104922_b15","doi-asserted-by":"crossref","unstructured":"Cao, Y., Zhou, L., Lee, S., Cabello, L., Chen, M., & Hershcovich, D (2023). Assessing cross-cultural alignment between ChatGPT and human societies: An empirical study. In Proceedings of the First Workshop on Cross-Cultural Considerations in NLP (pp. 53\u201367).","DOI":"10.18653\/v1\/2023.c3nlp-1.7"},{"key":"10.1016\/j.ipm.2026.104922_b16","doi-asserted-by":"crossref","first-page":"22","DOI":"10.1007\/s10916-024-02045-3","article-title":"The breakthrough of large language models release for medical applications: 1-year timeline and perspectives","volume":"48","author":"Cascella","year":"2024","journal-title":"Journal of Medical Systems"},{"key":"10.1016\/j.ipm.2026.104922_b17","series-title":"Language models reach higher agreement than humans in historical interpretation","author":"Celli","year":"2025"},{"key":"10.1016\/j.ipm.2026.104922_b18","series-title":"Metamorphic testing: a new approach for generating next test cases","author":"Chen","year":"1998"},{"issue":"1","key":"10.1016\/j.ipm.2026.104922_b19","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3143561","article-title":"Metamorphic testing: A review of challenges and opportunities","volume":"51","author":"Chen","year":"2018","journal-title":"ACM Computing Surveys"},{"key":"10.1016\/j.ipm.2026.104922_b20","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"17808","article-title":"Benchmarking large language models on controllable generation under diversified instructions","volume":"vol. 38","author":"Chen","year":"2024"},{"issue":"2","key":"10.1016\/j.ipm.2026.104922_b21","doi-asserted-by":"crossref","DOI":"10.1162\/99608f92.5317da47","article-title":"How is ChatGPT\u2019s behavior changing over time?","volume":"6","author":"Chen","year":"2024","journal-title":"Harvard Data Science Review"},{"key":"10.1016\/j.ipm.2026.104922_b22","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung","year":"2024","journal-title":"Journal of Machine Learning Research"},{"issue":"1","key":"10.1016\/j.ipm.2026.104922_b23","doi-asserted-by":"crossref","first-page":"37","DOI":"10.1177\/001316446002000104","article-title":"A coefficient of agreement for nominal scales","volume":"20","author":"Cohen","year":"1960","journal-title":"Educational and Psychological Measurement"},{"key":"10.1016\/j.ipm.2026.104922_b24","unstructured":"Dalayli, F. (2023). Use of NLP techniques in translation by ChatGPT: Case study. In Proceedings of the Workshop on Computational Terminology in NLP and translation studies (ConTeNTS) Incorporating the 16th Workshop on Building and Using Comparable Corpora (BUCC) (pp. 19\u201325)."},{"key":"10.1016\/j.ipm.2026.104922_b25","doi-asserted-by":"crossref","first-page":"214772","DOI":"10.1109\/ACCESS.2025.3646270","article-title":"Metamorphic testing for semantic invariance in large language models","volume":"13","author":"De Curt\u00f2","year":"2025","journal-title":"IEEE Access"},{"key":"10.1016\/j.ipm.2026.104922_b26","doi-asserted-by":"crossref","unstructured":"Ding, Y. Shi, T (2024). Sustainable LLM serving: Environmental implications, challenges, and opportunities. In 2024 IEEE 15th international green and sustainable computing conference (pp. 37\u201338).","DOI":"10.1109\/IGSC64514.2024.00016"},{"key":"10.1016\/j.ipm.2026.104922_b27","article-title":"A survey of natural language generation","volume":"55","author":"Dong","year":"2022","journal-title":"ACM Computing Surveys"},{"key":"10.1016\/j.ipm.2026.104922_b28","unstructured":"Fu, C., Chen, P., Shen, Y., Qin, Y., Zhang, M., Lin, X., et al. (2025). MME: A comprehensive evaluation benchmark for multimodal large language models. In The Thirty-ninth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"10.1016\/j.ipm.2026.104922_b29","series-title":"Why do large language models (LLMs) struggle to count letters?","author":"Fu","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b30","doi-asserted-by":"crossref","unstructured":"Futeral, M., Schmid, C., Laptev, I., Sagot, B., & Bawden, R (2023). Tackling ambiguity with images: Improved multimodal machine translation and contrastive evaluation. In Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: long papers) (pp. 5394\u20135413).","DOI":"10.18653\/v1\/2023.acl-long.295"},{"key":"10.1016\/j.ipm.2026.104922_b31","doi-asserted-by":"crossref","unstructured":"Galhardi, L., Herculano, M. F., Rodrigues, L., Miranda, P., Oliveira, H., Cordeiro, T., et al. (2024). Contextual features for automatic essay scoring in Portuguese. In International Conference on Artificial Intelligence in Education (pp. 270\u2013282).","DOI":"10.1007\/978-3-031-64315-6_23"},{"key":"10.1016\/j.ipm.2026.104922_b32","doi-asserted-by":"crossref","first-page":"1097","DOI":"10.1162\/coli_a_00524","article-title":"Bias and fairness in large language models: A survey","volume":"50","author":"Gallegos","year":"2024","journal-title":"Computational Linguistics"},{"key":"10.1016\/j.ipm.2026.104922_b33","doi-asserted-by":"crossref","first-page":"661","DOI":"10.1162\/coli_a_00561","article-title":"LLM-based NLG evaluation: Current status and challenges","volume":"51","author":"Gao","year":"2025","journal-title":"Computational Linguistics"},{"key":"10.1016\/j.ipm.2026.104922_b34","article-title":"Why is constrained neural language generation particularly challenging?","author":"Garbacea","year":"2025","journal-title":"Transactions on Machine Learning Research"},{"key":"10.1016\/j.ipm.2026.104922_b35","doi-asserted-by":"crossref","first-page":"1387","DOI":"10.1007\/s10579-023-09670-3","article-title":"MarIA and BETO are sexist: evaluating gender bias in large language models for spanish","volume":"58","author":"Garrido-Mu\u00f1oz","year":"2024","journal-title":"Language Resources and Evaluation"},{"key":"10.1016\/j.ipm.2026.104922_b36","doi-asserted-by":"crossref","first-page":"65","DOI":"10.1613\/jair.5477","article-title":"Survey of the state of the art in natural language generation: Core tasks, applications and evaluation","volume":"61","author":"Gatt","year":"2018","journal-title":"Journal of Artificial Intelligence Research"},{"key":"10.1016\/j.ipm.2026.104922_b37","series-title":"Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021","first-page":"1435","article-title":"Exploring the role of context in utterance-level emotion, act and intent classification in conversations: An empirical study","author":"Ghosal","year":"2021"},{"key":"10.1016\/j.ipm.2026.104922_b38","doi-asserted-by":"crossref","first-page":"43089","DOI":"10.1007\/s11042-023-15224-0","article-title":"A systematic survey on automated text generation tools and techniques: Application, evaluation, and challenges","volume":"82","author":"Goyal","year":"2023","journal-title":"Multimedia Tools Applications"},{"key":"10.1016\/j.ipm.2026.104922_b39","doi-asserted-by":"crossref","unstructured":"Harrison, V., Reed, L., Oraby, S., & Walker, M (2019). Maximizing stylistic control and semantic accuracy in NLG: Personality variation and discourse contrast. In Proceedings of the 1st workshop on discourse structure in neural NLG (pp. 1\u201312).","DOI":"10.18653\/v1\/W19-8101"},{"key":"10.1016\/j.ipm.2026.104922_b40","first-page":"60","article-title":"Potential of large language models (LLMs) as supplementary tools for historical learning: Users\u2019 interaction and knowledge acquisition","volume":"4","author":"Hasnain","year":"2024","journal-title":"Foundation University Journal of Engineering and Applied Sciences (HEC Recognized Y Category, ISSN 2706-7351)"},{"key":"10.1016\/j.ipm.2026.104922_b41","first-page":"32336","article-title":"Large language models\u2019 expert-level global history knowledge benchmark (hist-LLM)","volume":"37","author":"Hauser","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.ipm.2026.104922_b42","doi-asserted-by":"crossref","unstructured":"Hoffmann, J., Borgeaud, S., Mensch, A., Buchatskaya, E., Cai, T., Rutherford, E., Las Casas, D., Hendricks, L., Welbl, J., Clark, A., Hennigan, T., Noland, E., Millican, K., Driessche, G., Damoc, B., Guy, A., Osindero, S., Simonyan, K., Elsen, E., .... Sifre, L (2022). Training compute-optimal large language models. In Proceedings of the 36th international conference on neural information processing systems.","DOI":"10.52202\/068431-2176"},{"key":"10.1016\/j.ipm.2026.104922_b43","doi-asserted-by":"crossref","first-page":"1413","DOI":"10.1109\/TKDE.2023.3310002","article-title":"A survey of knowledge enhanced pre-trained language models","volume":"36","author":"Hu","year":"2023","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"10.1016\/j.ipm.2026.104922_b44","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2023","first-page":"12365","article-title":"Not all languages are created equal in LLMs: Improving multilingual capability by cross-lingual-thought prompting","author":"Huang","year":"2023"},{"key":"10.1016\/j.ipm.2026.104922_b45","doi-asserted-by":"crossref","DOI":"10.1145\/3703155","article-title":"A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions","volume":"43","author":"Huang","year":"2025","journal-title":"ACM Transactions in Information Systems"},{"key":"10.1016\/j.ipm.2026.104922_b46","doi-asserted-by":"crossref","unstructured":"Ignat, O., Jin, Z., Abzaliev, A., Biester, L., Castro, S., Deng, N., Gao, X., Gunal, A., He, J., Kazemi, A., Khalifa, M., Koh, N., Lee, A., Liu, S., Min, D., Mori, S., Nwatu, J., P\u00e9rez-Rosas, V., Shen, S., .... Mihalcea, R (2024). Has it all been solved? Open NLP research questions not solved by large language models. In Proceedings of the 2024 joint international conference on computational linguistics, language resources and evaluation (pp. 8050\u20138094).","DOI":"10.63317\/2bw7xfcwyeq3"},{"key":"10.1016\/j.ipm.2026.104922_b47","doi-asserted-by":"crossref","unstructured":"Jain, D., Agarwal, A., Baliyan, S., & Kanagaraj, R. (2025). The carbon footprint of intelligence: The environment cost of LLMs. In 2025 9th international conference on electronics, communication and aerospace technology (pp. 2069\u20132075).","DOI":"10.1109\/ICECA66444.2025.11383190"},{"key":"10.1016\/j.ipm.2026.104922_b48","series-title":"Proceedings of ArabicNLP 2023","first-page":"359","article-title":"SALMA: Arabic sense-annotated corpus and WSD benchmarks","author":"Jarrar","year":"2023"},{"key":"10.1016\/j.ipm.2026.104922_b49","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3571730","article-title":"Survey of hallucination in natural language generation","volume":"55","author":"Ji","year":"2023","journal-title":"ACM Computing Surveys"},{"key":"10.1016\/j.ipm.2026.104922_b50","doi-asserted-by":"crossref","first-page":"41","DOI":"10.1007\/978-981-97-0747-8_4","article-title":"Knowledge-augmented methods for natural language generation","author":"Jiang","year":"2024","journal-title":"Knowledge-Augmented Methods for Natural Language Processing"},{"key":"10.1016\/j.ipm.2026.104922_b51","series-title":"Mixtral of experts","author":"Jiang","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b52","series-title":"Instruction-tuned language models are better knowledge learners","author":"Jiang","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b53","series-title":"Mistral 7B","author":"Jiang","year":"2023"},{"key":"10.1016\/j.ipm.2026.104922_b54","doi-asserted-by":"crossref","unstructured":"Joshi, I., Shahid, S., Venneti, S. M., Vasu, M., Zheng, Y., Li, Y., et al. (2025). Coprompter User-centric evaluation of LLM instruction alignment for improved prompt engineering. In Proceedings of the 30th international conference on intelligent user interfaces (pp. 341\u2013365).","DOI":"10.1145\/3708359.3712102"},{"key":"10.1016\/j.ipm.2026.104922_b55","doi-asserted-by":"crossref","unstructured":"Kajiwara, T., Chu, C., Takemura, N., Nakashima, Y., & Nagahara, H (2021). WRIME: A new dataset for emotional intensity estimation with subjective and objective annotations. In Proceedings of the 2021 conference of the North American chapter of the association for computational linguistics: human language technologies (pp. 2095\u20132104).","DOI":"10.18653\/v1\/2021.naacl-main.169"},{"key":"10.1016\/j.ipm.2026.104922_b56","series-title":"Why language models hallucinate","author":"Kalai","year":"2025"},{"key":"10.1016\/j.ipm.2026.104922_b57","series-title":"Large Language Models: A Deep Dive: Bridging Theory And Practice","first-page":"219","article-title":"LLM challenges and solutions","author":"Kamath","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b58","series-title":"Findings of the Association for Computational Linguistics: ACL 2024","first-page":"8940","article-title":"Investigating subtler biases in LLMs: Ageism, beauty, institutional, and nationality bias in generative models","author":"Kamruzzaman","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b59","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2023","first-page":"14197","article-title":"TELeR: A general taxonomy of LLM prompts for benchmarking complex tasks","author":"Karmaker Santu","year":"2023"},{"key":"10.1016\/j.ipm.2026.104922_b60","series-title":"2025 8th international conference on computing methodologies and communication","first-page":"944","article-title":"Toward sustainable AI: A review of energy-efficient large language models","author":"Kaushik","year":"2025"},{"key":"10.1016\/j.ipm.2026.104922_b61","doi-asserted-by":"crossref","unstructured":"Kotek, H., Dockum, R., & Sun, D (2023). Gender bias and stereotypes in large language models. In Proceedings of the ACM Collective Intelligence Conference (pp. 12\u201324).","DOI":"10.1145\/3582269.3615599"},{"key":"10.1016\/j.ipm.2026.104922_b62","series-title":"Large Language Models in Cybersecurity: Threats, Exposure and Mitigation","first-page":"31","article-title":"Overview of existing LLM families","author":"Kucharavy","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b63","doi-asserted-by":"crossref","unstructured":"Lewis, M., Liu, Y., Goyal, N., Ghazvininejad, M., Mohamed, A., Levy, O., Stoyanov, V., & Zettlemoyer, L (2020). BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In Proceedings of the 58th annual meeting of the association for computational linguistics (pp. 7871\u20137880).","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"10.1016\/j.ipm.2026.104922_b64","doi-asserted-by":"crossref","unstructured":"Li, J., Chen, J., Ren, R., Cheng, X., Zhao, W., Nie, J., & Wen, J (2024). The dawn after the dark: An empirical study on factuality hallucination in large language models. In Proceedings of the 62nd annual meeting of the association for computational linguistics (volume 1: long papers) (pp. 10879\u201310899).","DOI":"10.18653\/v1\/2024.acl-long.586"},{"key":"10.1016\/j.ipm.2026.104922_b65","doi-asserted-by":"crossref","DOI":"10.1145\/3722552","article-title":"From matching to generation: A survey on generative information retrieval","author":"Li","year":"2025","journal-title":"ACM Transactions on Information Systems"},{"key":"10.1016\/j.ipm.2026.104922_b66","series-title":"From misleading queries to accurate answers: A three-stage fine-tuning method for LLMs","author":"Li","year":"2025"},{"key":"10.1016\/j.ipm.2026.104922_b67","series-title":"Holistic evaluation of language models","author":"Liang","year":"2022"},{"key":"10.1016\/j.ipm.2026.104922_b68","series-title":"Controllable text generation for large language models: a survey","author":"Liang","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b69","doi-asserted-by":"crossref","unstructured":"Lin, B., Lee, S., Qiao, X., & Ren, X (2021). Common sense beyond English: Evaluating and improving multilingual language models for commonsense reasoning. In Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 1: long papers) (pp. 1274\u20131287).","DOI":"10.18653\/v1\/2021.acl-long.102"},{"key":"10.1016\/j.ipm.2026.104922_b70","first-page":"24","article-title":"A survey of hallucination problems based on large language models","author":"Liu","year":"2024","journal-title":"Applied and Computational Engineering"},{"key":"10.1016\/j.ipm.2026.104922_b71","doi-asserted-by":"crossref","unstructured":"Liu, J., Chen, S., Cheng, Y., & He, J (2024). On the universal truthfulness hyperplane inside LLMs. In Proceedings of the 2024 conference on empirical methods in natural language processing (pp. 18199\u201318224).","DOI":"10.18653\/v1\/2024.emnlp-main.1012"},{"key":"10.1016\/j.ipm.2026.104922_b72","series-title":"Robustness over time: Understanding adversarial examples\u2019 effectiveness on longitudinal versions of large language models","author":"Liu","year":"2023"},{"key":"10.1016\/j.ipm.2026.104922_b73","doi-asserted-by":"crossref","unstructured":"Liu, M., Liu, F., Fiannaca, A., Koo, T., Dixon, L., Terry, M., & Cai, C. (2024). We Need Structured Output: Towards user-centered constraints on large language model output. In Extended Abstracts of the CHI Conference on Human Factors in Computing Systems.","DOI":"10.1145\/3613905.3650756"},{"key":"10.1016\/j.ipm.2026.104922_b74","doi-asserted-by":"crossref","unstructured":"Liu, S., Maturi, T., Yi, B., Shen, S., & Mihalcea, R (2024). The generation gap: Exploring age bias in the value systems of large language models. In Proceedings of the 2024 conference on empirical methods in natural language processing (pp. 19617\u201319634).","DOI":"10.18653\/v1\/2024.emnlp-main.1094"},{"key":"10.1016\/j.ipm.2026.104922_b75","series-title":"Is translation all you need? A study on solving multilingual tasks with large language models","author":"Liu","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b76","doi-asserted-by":"crossref","unstructured":"Ma, P., Wang, S., & Liu, J. (2020). Metamorphic testing and certified mitigation of fairness violations in NLP models. In Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence (IJCAI), Main track, vol. 20 (pp. 458\u2013465).","DOI":"10.24963\/ijcai.2020\/64"},{"key":"10.1016\/j.ipm.2026.104922_b77","unstructured":"Maestre, M., Mart\u00ednez-Murillo, I., Lloret, E., Moreda, P., & Cueto, A (2024). COCOTEROS: A Spanish corpus with contextual knowledge for natural language generation. In 40th annual conference of the spanish association for natural language processing."},{"key":"10.1016\/j.ipm.2026.104922_b78","first-page":"67","article-title":"Roadmap for natural language generation: Challenges and insights","volume":"74","author":"Maestre","year":"2025","journal-title":"Procesamiento Del Lenguaje Natural"},{"key":"10.1016\/j.ipm.2026.104922_b79","doi-asserted-by":"crossref","unstructured":"Mao, R., Tan, L., & Moieni, R (2023). Developing a large-scale language model to unveil and alleviate gender and age biases in Australian job ads. In 2023 IEEE international conference on big data (bigData) (pp. 4176\u20134185).","DOI":"10.1109\/BigData59044.2023.10386083"},{"key":"10.1016\/j.ipm.2026.104922_b80","unstructured":"Martelli, F., Perrella, S., Campolungo, N., Munda, T., Koeva, S., Tiberius, C., & Navigli, R DiBiMT: A gold evaluation benchmark for studying lexical ambiguity in machine translation. Computational Linguistics."},{"key":"10.1016\/j.ipm.2026.104922_b81","series-title":"Gemma: open models based on gemini research and technology","author":"Mesnard","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b82","doi-asserted-by":"crossref","first-page":"158","DOI":"10.1016\/j.jpedsurg.2023.08.018","article-title":"Bibliographic research with ChatGPT may be misleading: the problem of hallucination","volume":"59","author":"Metze","year":"2024","journal-title":"J Pediatric Surgery"},{"key":"10.1016\/j.ipm.2026.104922_b83","doi-asserted-by":"crossref","first-page":"95851","DOI":"10.1109\/ACCESS.2025.3573955","article-title":"A comprehensive overview and analysis of large language models: Trends and challenges","volume":"13","author":"Mohammed","year":"2025","journal-title":"IEEE Access"},{"key":"10.1016\/j.ipm.2026.104922_b84","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2022","first-page":"1919","article-title":"Audience-centric natural language generation via style infusion","author":"Moorjani","year":"2022"},{"key":"10.1016\/j.ipm.2026.104922_b85","series-title":"2023 IEEE\/ACM 45th international conference on software engineering","first-page":"768","article-title":"Developer-intent driven code comment heneration","author":"Mu","year":"2023"},{"key":"10.1016\/j.ipm.2026.104922_b86","series-title":"Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 1: long papers)","first-page":"5356","article-title":"StereoSet: Measuring stereotypical bias in pretrained language models","author":"Nadeem","year":"2021"},{"key":"10.1016\/j.ipm.2026.104922_b87","doi-asserted-by":"crossref","unstructured":"Narayanan Venkit, P., Chakravorti, T., Gupta, V., Biggs, H., Srinath, M., Goswami, K., Rajtmajer, S. Wilson, S (2024). An audit on the perspectives and challenges of hallucinations in NLP. In Proceedings of the 2024 conference on empirical methods in natural language processing (pp. 6528\u20136548).","DOI":"10.18653\/v1\/2024.emnlp-main.375"},{"issue":"5","key":"10.1016\/j.ipm.2026.104922_b88","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3744746","article-title":"A comprehensive overview of large language models","volume":"16","author":"Naveed","year":"2025","journal-title":"ACM Transactions on Intelligent Systems and Technology"},{"key":"10.1016\/j.ipm.2026.104922_b89","doi-asserted-by":"crossref","DOI":"10.1016\/j.isci.2024.110878","article-title":"Overview and challenges of machine translation for contextually appropriate translations","volume":"27","author":"Naveen","year":"2024","journal-title":"IScience"},{"key":"10.1016\/j.ipm.2026.104922_b90","doi-asserted-by":"crossref","unstructured":"Nguyen, T., Razniewski, S., Varde, A., & Weikum, G (2023). Extracting cultural commonsense Knowledge at scale. In Proceedings of the ACM web conference 2023 (pp. 1907\u20131917).","DOI":"10.1145\/3543507.3583535"},{"key":"10.1016\/j.ipm.2026.104922_b91","doi-asserted-by":"crossref","unstructured":"Nguyen, T., Razniewski, S., & Weikum, G (2024). Cultural commonsense knowledge for intercultural dialogues. In Proceedings of the 33rd ACM international conference on information and knowledge management (pp. 1774\u20131784).","DOI":"10.1145\/3627673.3679768"},{"key":"10.1016\/j.ipm.2026.104922_b92","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1038\/s41746-023-00939-z","article-title":"Large language models propagate race-based medicine","volume":"6","author":"Omiye","year":"2023","journal-title":"NPJ Digital Medicine"},{"key":"10.1016\/j.ipm.2026.104922_b93","series-title":"Proceedings of the 2025 conference of the nations of the americas chapter of the association for computational linguistics: human language technologies (volume 3: industry track)","first-page":"1","article-title":"Understanding LLM development through longitudinal study: Insights from the open Ko-LLM leaderboard","author":"Park","year":"2025"},{"key":"10.1016\/j.ipm.2026.104922_b94","series-title":"LLMLagBench: identifying temporal training boundaries in large language models","author":"P\u0119zik","year":"2025"},{"key":"10.1016\/j.ipm.2026.104922_b95","doi-asserted-by":"crossref","unstructured":"Pokharel, R., & Agrawal, A. (2025). MTQ-Eval: Multilingual text quality evaluation for language models. In Proceedings of the 14th international joint conference on natural language processing and the 4th conference of the Asia-Pacific chapter of the association for computational linguistics (pp. 1289\u20131304).","DOI":"10.18653\/v1\/2025.findings-ijcnlp.79"},{"key":"10.1016\/j.ipm.2026.104922_b96","doi-asserted-by":"crossref","unstructured":"Prabhumoye, S., Black, A., & Salakhutdinov, R (2020). Exploring controllable text generation techniques. In Proceedings of the 28th international conference on computational linguistics (pp. 1\u201314).","DOI":"10.18653\/v1\/2020.coling-main.1"},{"key":"10.1016\/j.ipm.2026.104922_b97","series-title":"How easy is it to fool your multimodal llms? An empirical analysis on deceptive prompts","first-page":"2","author":"Qian","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b98","series-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"10.1016\/j.ipm.2026.104922_b99","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.ipm.2026.104922_b100","doi-asserted-by":"crossref","unstructured":"Ramu, P., Gaur, P., Emandi, R., Maheshwari, H., Javed, D., & Garimella, A (2024). Zooming in on zero-shot intent-guided and grounded document generation using LLMs. In Proceedings of the 17th international natural language generation conference (pp. 676\u2013694).","DOI":"10.18653\/v1\/2024.inlg-main.52"},{"key":"10.1016\/j.ipm.2026.104922_b101","series-title":"HALoGEN: Fantastic LLM hallucinations and where to find them","author":"Ravichander","year":"2025"},{"key":"10.1016\/j.ipm.2026.104922_b102","doi-asserted-by":"crossref","unstructured":"Rawte, V., Chakraborty, S., Pathak, A., Sarkar, A., Tonmoy, S., Chadha, A., Sheth, A., & Das, A (2023). The troubling emergence of hallucination in large language models - An extensive definition, quantification, and prescriptive remediations. In Proceedings of the 2023 conference on empirical methods in natural language processing (pp. 2541\u20132573).","DOI":"10.18653\/v1\/2023.emnlp-main.155"},{"key":"10.1016\/j.ipm.2026.104922_b103","doi-asserted-by":"crossref","unstructured":"Reiter, E. (1994). Has a consensus NL generation architecture appeared, and is it psycholinguistically plausible?. In Proceedings of the Seventh International Workshop on Natural Language Generation (pp. 163\u2013170).","DOI":"10.3115\/1641417.1641436"},{"key":"10.1016\/j.ipm.2026.104922_b104","series-title":"Validity, reliability, and Significance. Synthesis Lectures on Human Language Technologies","doi-asserted-by":"crossref","first-page":"105","DOI":"10.1007\/978-3-031-57065-0_4","article-title":"Significance","author":"Riezler","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b105","series-title":"What\u2019s in a name? Auditing large language models for race and gender bias","author":"Salinas","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b106","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1007\/978-981-96-3311-1_4","article-title":"Mitigating hallucinations in large language models: A comprehensive survey on detection and reduction strategies","author":"Saxena","year":"2025","journal-title":"Sustainable Computing and Intelligent Systems"},{"key":"10.1016\/j.ipm.2026.104922_b107","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2023","first-page":"7977","article-title":"ZeroSCROLLS: A zero-shot benchmark for long text understanding","author":"Shaham","year":"2023"},{"key":"10.1016\/j.ipm.2026.104922_b108","doi-asserted-by":"crossref","unstructured":"Shen, S., Logeswaran, L., Lee, M., Lee, H., Poria, S., & Mihalcea, R (2024). Understanding the capabilities and limitations of large language models for cultural commonsense. In Proceedings of the 2024 conference of the North American chapter of the association for computational linguistics: human language technologies (volume 1: long papers) (pp. 5668\u20135680).","DOI":"10.18653\/v1\/2024.naacl-long.316"},{"key":"10.1016\/j.ipm.2026.104922_b109","series-title":"Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 1: long papers)","first-page":"4275","article-title":"Societal biases in language generation: Progress and challenges","author":"Sheng","year":"2021"},{"key":"10.1016\/j.ipm.2026.104922_b110","series-title":"Large language models lack understanding of character composition of words","author":"Shin","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b111","first-page":"61836","article-title":"On the exploitability of instruction tuning","volume":"36","author":"Shu","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.ipm.2026.104922_b112","doi-asserted-by":"crossref","first-page":"108","DOI":"10.1111\/iej.13985","article-title":"Unveiling the ChatGPT phenomenon: evaluating the consistency and accuracy of endodontic question answers","author":"Su\u00e1rez","year":"2024","journal-title":"International Endodontic Journal"},{"key":"10.1016\/j.ipm.2026.104922_b113","doi-asserted-by":"crossref","DOI":"10.1145\/3475872","article-title":"Response generation by jointly modeling personalized linguistic styles and emotions","volume":"18","author":"Sun","year":"2022","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"10.1016\/j.ipm.2026.104922_b114","doi-asserted-by":"crossref","unstructured":"Tjuatja, L., Neubig, G., Linzen, T., & Hao, S. (2025). What goes into a LM acceptability judgment? Rethinking the impact of frequency and length. In Proceedings of the 2025 conference of the nations of the americas chapter of the association for computational linguistics: human language technologies (volume 1: long papers) (pp. 2173\u20132186).","DOI":"10.18653\/v1\/2025.naacl-long.109"},{"key":"10.1016\/j.ipm.2026.104922_b115","series-title":"Llama: open and efficient foundation language models","author":"Touvron","year":"2023"},{"issue":"2","key":"10.1016\/j.ipm.2026.104922_b116","doi-asserted-by":"crossref","first-page":"399","DOI":"10.1007\/s11336-014-9439-4","article-title":"A new interpretation of the weighted kappa coefficients","volume":"81","author":"Vanbelle","year":"2016","journal-title":"Psychometrika"},{"key":"10.1016\/j.ipm.2026.104922_b117","series-title":"Advances In Neural Information Processing Systems","article-title":"Attention is all you need","volume":"vol. 30","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.ipm.2026.104922_b118","doi-asserted-by":"crossref","unstructured":"Vazquez Risco, A., Ramirez, A., Pullabhotla, N., Qiang, N., Zhang, H., Walker, M., & Torres, M (2024). Knowledge-grounded dialogue act transfer using prompt-based learning for controllable open-domain NLG. In Proceedings of the 25th annual meeting of the special interest group on discourse and dialogue (pp. 78\u201391).","DOI":"10.18653\/v1\/2024.sigdial-1.7"},{"key":"10.1016\/j.ipm.2026.104922_b119","doi-asserted-by":"crossref","first-page":"19","DOI":"10.1145\/3688007","article-title":"GPTs and hallucination: Why do large language models hallucinate?","volume":"22","author":"Waldo","year":"2024","journal-title":"Queue"},{"key":"10.1016\/j.ipm.2026.104922_b120","doi-asserted-by":"crossref","unstructured":"Wang, Y., Cheng, L., & Liu, C. (2025). A survey of lifecycle carbon emissions of LLMs. In 2025 IEEE 4th international conference on industrial electronics for sustainable energy systems (pp. 292\u2013297).","DOI":"10.1109\/IESES66335.2025.11359915"},{"key":"10.1016\/j.ipm.2026.104922_b121","doi-asserted-by":"crossref","first-page":"51","DOI":"10.1016\/j.eng.2022.04.024","article-title":"Pre-trained language models and their applications","volume":"25","author":"Wang","year":"2023","journal-title":"Engineering"},{"key":"10.1016\/j.ipm.2026.104922_b122","series-title":"Bloom: a 176b-parameter open-access multilingual language model","author":"Workshop","year":"2022"},{"key":"10.1016\/j.ipm.2026.104922_b123","article-title":"A survey on multilingual large language models: Corpora, alignment, and bias","volume":"19","author":"Xu","year":"2025","journal-title":"Frontiers in Computer Science"},{"key":"10.1016\/j.ipm.2026.104922_b124","doi-asserted-by":"crossref","unstructured":"Xu, N., & Ma, X (2025). LLM The genius paradox: A linguistic and math expert\u2019s struggle with simple word-based counting problems. In Proceedings of the 2025 conference of the nations of the americas chapter of the association for computational linguistics: human language technologies (volume 1: long papers) (pp. 3344\u20133370).","DOI":"10.18653\/v1\/2025.naacl-long.172"},{"key":"10.1016\/j.ipm.2026.104922_b125","doi-asserted-by":"crossref","first-page":"3091","DOI":"10.1109\/TKDE.2024.3360454","article-title":"Give us the facts: Enhancing large language models with knowledge graphs for fact-aware language modeling","volume":"36","author":"Yang","year":"2024","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"10.1016\/j.ipm.2026.104922_b126","doi-asserted-by":"crossref","unstructured":"Yu, F., Seedat, N., Herrmannova, D., Schilder, F., & Schwarz, J. R. (2025). Beyond pointwise scores: Decomposed criteria-based evaluation of LLM responses. In Proceedings of the 2025 conference on empirical methods in natural language processing: industry track (pp. 1931\u20131954).","DOI":"10.18653\/v1\/2025.emnlp-industry.136"},{"key":"10.1016\/j.ipm.2026.104922_b127","doi-asserted-by":"crossref","unstructured":"Zamfirescu-Pereira, J. D., Wong, R. Y., Hartmann, B., & Yang, Q (2023). Why Johnny can\u2019t prompt: How non-AI experts try (and fail) to design LLM prompts. In Proceedings of the 2023 CHI conference on human factors in computing systems (pp. 1\u201321).","DOI":"10.1145\/3544548.3581388"},{"key":"10.1016\/j.ipm.2026.104922_b128","doi-asserted-by":"crossref","DOI":"10.1002\/0470011815.b2a15150","article-title":"Spearman rank correlation","volume":"7","author":"Zar","year":"2005","journal-title":"Encyclopedia of Biostatistics"},{"key":"10.1016\/j.ipm.2026.104922_b129","doi-asserted-by":"crossref","unstructured":"Zellers, R., Holtzman, A., Bisk, Y., Farhadi, A., & Choi, Y (2019). HellaSwag: Can a machine really finish your sentence?. In Proceedings of the 57th annual meeting of the association for computational linguistics (pp. 4791\u20134800).","DOI":"10.18653\/v1\/P19-1472"},{"key":"10.1016\/j.ipm.2026.104922_b130","series-title":"Counting ability of large language models and impact of tokenization","author":"Zhang","year":"2024"},{"key":"10.1016\/j.ipm.2026.104922_b131","doi-asserted-by":"crossref","unstructured":"Zhang, X., Li, S., Hauer, B., Shi, N., & Kondrak, G (2023). Don\u2019t trust ChatGPT when your question is not in English: A study of multilingual abilities and types of LLMs. In Proceedings of the 2023 conference on empirical methods in natural language processing (pp. 7915\u20137927).","DOI":"10.18653\/v1\/2023.emnlp-main.491"},{"key":"10.1016\/j.ipm.2026.104922_b132","series-title":"LLMEval-3: a large-scale longitudinal study on robust and fair evaluation of large language models","author":"Zhang","year":"2025"},{"key":"10.1016\/j.ipm.2026.104922_b133","first-page":"1","article-title":"A survey of controllable text generation using transformer-based pre-trained language models","volume":"56","author":"Zhang","year":"2023","journal-title":"ACM Computing Surveys"}],"container-title":["Information Processing &amp; Management"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306457326003134?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306457326003134?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,30]],"date-time":"2026-05-30T16:31:51Z","timestamp":1780158711000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0306457326003134"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":133,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2026,12]]}},"alternative-id":["S0306457326003134"],"URL":"https:\/\/doi.org\/10.1016\/j.ipm.2026.104922","relation":{},"ISSN":["0306-4573"],"issn-type":[{"value":"0306-4573","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Cross-sectional analysis of large language models in current natural language generation challenges","name":"articletitle","label":"Article Title"},{"value":"Information Processing & Management","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.ipm.2026.104922","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Authors. Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"104922"}}