{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T16:15:08Z","timestamp":1778084108000,"version":"3.51.4"},"reference-count":98,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T00:00:00Z","timestamp":1770595200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T00:00:00Z","timestamp":1770595200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"National Natural Science Foundation of China (NSFC) project (","award":["o.62276193)"],"award-info":[{"award-number":["o.62276193)"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s10994-026-07011-3","type":"journal-article","created":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T13:14:00Z","timestamp":1772802840000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["How Robust are Large Language Models Against Word-Level Spurious Correlations? A Causal Discovery Approach"],"prefix":"10.1007","volume":"115","author":[{"given":"Xin","family":"Miao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongqi","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hankun","family":"Kang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mayi","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jintao","family":"Wen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuyang","family":"Ren","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tieyun","family":"Qian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,9]]},"reference":[{"key":"7011_CR1","unstructured":"Abdulaal, A., Montana-Brown, N., He, T., Ijishakin, A., Drobnjak, I., Castro, D. C., Alexander, D. C., et al. (2023). Causal modelling agents: Causal graph discovery through synergising metadata-and data-driven reasoning. In The Twelfth international conference on learning representations."},{"key":"7011_CR2","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F. L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et al. (2023). GPT-4 technical report. arXiv preprint. arXiv:2303.08774"},{"issue":"2","key":"7011_CR3","doi-asserted-by":"publisher","first-page":"505","DOI":"10.1214\/aos\/1031833662","volume":"25","author":"SA Andersson","year":"1997","unstructured":"Andersson, S. A., Madigan, D., & Perlman, M. D. (1997). A characterization of Markov equivalence classes for acyclic digraphs. The Annals of Statistics, 25(2), 505\u2013541.","journal-title":"The Annals of Statistics"},{"key":"7011_CR4","doi-asserted-by":"crossref","unstructured":"Ashwani, S., Hegde, K., Mannuru, N. R., Sengar, D. S., Jindal, M., Kathala, K. C. R., Banga, D., Jain, V., & Chadha, A. (2024). Cause and effect: Can large language models truly understand causality? In Proceedings of the AAAI symposium series (Vol. 4, pp. 2\u20139).","DOI":"10.1609\/aaaiss.v4i1.31764"},{"key":"7011_CR5","volume-title":"Large language models for reliable information extraction","author":"L Baliunas","year":"2023","unstructured":"Baliunas, L. (2023). Large language models for reliable information extraction. Department of Engineering, University of Cambridge."},{"key":"7011_CR6","unstructured":"Carlini, N., Tramer, F., Wallace, E., Jagielski, M., Herbert-Voss, A., Lee, K., Roberts, A., Brown, T., Song, D., Erlingsson, U., & et al. (2021). Extracting training data from large language models. In 30th USENIX Security Symposium (USENIX Security 21) (pp. 2633\u20132650)."},{"key":"7011_CR7","unstructured":"Chen, H. (2023). Large knowledge model: Perspectives and challenges. arXiv preprint. arXiv:2312.02706"},{"key":"7011_CR8","unstructured":"Cheng, Y., Chang, Y., & Wu, Y. (2025). A survey on data contamination for large language models. arXiv e-prints, 2502."},{"issue":"2","key":"7011_CR9","doi-asserted-by":"publisher","first-page":"110","DOI":"10.1038\/s42256-022-00445-z","volume":"4","author":"P Cui","year":"2022","unstructured":"Cui, P., & Athey, S. (2022). Stable learning establishes some common ground between causal inference and machine learning. Nature Machine Intelligence, 4(2), 110\u2013115.","journal-title":"Nature Machine Intelligence"},{"key":"7011_CR10","doi-asserted-by":"crossref","unstructured":"Deng, C., Zhao, Y., Tang, X., Gerstein, M., & Cohan, A. (2024). Investigating data contamination in modern benchmarks for large language models. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human language technologies (Volume 1: Long Papers) (pp. 8698\u20138711).","DOI":"10.18653\/v1\/2024.naacl-long.482"},{"key":"7011_CR11","doi-asserted-by":"crossref","unstructured":"Ding, N., Xu, G., Chen, Y., Wang, X., Han, X., Xie, P., Zheng, H., & Liu, Z. (2021). Few-nerd: A few-shot named entity recognition dataset. In Proceedings of the 59th annual meeting of the Association for Computational Linguistics and the 11th international joint conference on natural language processing (Volume 1: Long Papers) (pp. 3198\u20133213).","DOI":"10.18653\/v1\/2021.acl-long.248"},{"key":"7011_CR12","unstructured":"Dong, Q., Li, L., Dai, D., Zheng, C., Ma, J., Li, R., Xia, H., Xu, J., Wu, Z., Liu, T., et al. (2022). A survey on in-context learning. arXiv preprint arXiv:2301.00234"},{"key":"7011_CR13","unstructured":"Dubey, A., Jauhri, A., Pandey, A., Kadian, A., Al-Dahle, A., Letman, A., Mathur, A., Schelten, A., Yang, A., Fan, A., et al. (2024). The Llama 3 herd of models. arXiv e-prints. arXiv:2407.217832407"},{"key":"7011_CR14","unstructured":"Elazar, Y., Kassner, N., Ravfogel, S., Feder, A., Ravichander, A., Mosbach, M., Belinkov, Y., Sch\u00fctze, H., & Goldberg, Y. (2022). Measuring causal effects of data statistics on language model\u2019sfactual\u2019predictions. arXiv preprint. arXiv:2207.14251"},{"key":"7011_CR15","unstructured":"Gan, Y., Yang, Y., Ma, Z., He, P., Zeng, R., Wang, Y., Li, Q., Zhou, C., Li, S., Wang, T., et al. (2024). Navigating the risks: A survey of security, privacy, and ethics threats in LLM-based agents. arXiv preprint. arXiv:2411.09523"},{"key":"7011_CR16","doi-asserted-by":"crossref","unstructured":"Gao, J., Ding, X., Qin, B., & Liu, T. (2023). Ischatgpt a good causal reasoner? A comprehensive evaluation. In Findings of the Association for Computational Linguistics: EMNLP 2023 (pp. 11111\u201311126).","DOI":"10.18653\/v1\/2023.findings-emnlp.743"},{"issue":"11","key":"7011_CR17","doi-asserted-by":"publisher","first-page":"665","DOI":"10.1038\/s42256-020-00257-z","volume":"2","author":"R Geirhos","year":"2020","unstructured":"Geirhos, R., Jacobsen, J.-H., Michaelis, C., Zemel, R., Brendel, W., Bethge, M., & Wichmann, F. A.(2020). Shortcut learning in deep neural networks. Nature Machine Intelligence, 2(11), 665\u2013673.","journal-title":"Nature Machine Intelligence"},{"key":"7011_CR18","unstructured":"Goel, A., Gueta, A., Gilon, O., Liu, C., Erell, S., Nguyen, L.H., Hao, X., Jaber, B., Reddy, S., Kartha, R., et al. (2023). LLMS accelerate annotation for medical information extraction. In Machine learning for health (ML4H) (pp. 82\u2013100). PMLR."},{"key":"7011_CR19","unstructured":"Grishman, R., Westbrook, D., & Meyers, A.(2005). Nyu\u2019s English ACE 2005 system description. In ACE 2005 evaluation workshop."},{"key":"7011_CR20","unstructured":"Guo, D., Yang, D., Zhang, H., Song, J., Zhang, R., Xu, R., Zhu, Q., Ma, S., Wang, P., Bi, X., et al. (2025). Deepseek-r1: Incentivizing reasoning capability in LLMS via reinforcement learning. arXiv preprint. arXiv:2501.12948"},{"issue":"5","key":"7011_CR21","doi-asserted-by":"publisher","first-page":"2351","DOI":"10.1007\/s10994-023-06495-7","volume":"113","author":"Z Hammoudeh","year":"2024","unstructured":"Hammoudeh, Z., & Lowd, D. (2024). Training data influence analysis and estimation: A survey. Machine Learning, 113(5), 2351\u20132403.","journal-title":"Machine Learning"},{"key":"7011_CR22","first-page":"33","volume":"2010","author":"I Hendrickx","year":"2010","unstructured":"Hendrickx, I., Kim, S. N., Kozareva, Z., Nakov, P., S\u00e9aghdha, D. O., Pad\u00f3, S., Pennacchiotti, M., Romano, L., & Szpakowicz, S. (2010). Semeval-2010 task 8: Multi-way classification of semantic relations between pairs of nominals. ACL, 2010, 33.","journal-title":"ACL"},{"key":"7011_CR23","unstructured":"Hobbhahn, M., Lieberum, T., & Seiler, D. (2022). Investigating causal understanding in LLMS. In NeurIPS ML safety workshop."},{"key":"7011_CR24","unstructured":"Hu, X., Chen, J., Li, X., Guo, Y., Wen, L., Philip, S.Y., & Guo, Z. (2024). Towards understanding factual knowledge of large language models. In ICLR."},{"key":"7011_CR25","doi-asserted-by":"crossref","unstructured":"Hu, Y., Liu, H., Chen, Q., Zheng, N., Wang, C., Liu, Y., Clarke, C.L., & Shen, W. (2025). J&H: Evaluating the robustness of large language models under knowledge-injection attacks in legal domain. In Proceedings of the AAAI conference on artificial intelligence (Vol. 39, pp. 28106\u201328115).","DOI":"10.1609\/aaai.v39i27.35029"},{"key":"7011_CR26","doi-asserted-by":"crossref","unstructured":"Huang, F., Huang, Q., Zhao, Y., Qi, Z., Wang, B., Huang, Y., & Li, S. (2023). A three-stage framework for event-event relation extraction with large language model. In International conference on neural information processing (pp. 434\u2013446). Springer.","DOI":"10.1007\/978-981-99-8181-6_33"},{"issue":"2","key":"7011_CR27","doi-asserted-by":"publisher","first-page":"806","DOI":"10.1111\/1911-3846.12832","volume":"40","author":"AH Huang","year":"2023","unstructured":"Huang, A.H., Wang, H., & Yang, Y. (2023). Finbert: A large language model for extracting information from financial text. Contemporary Accounting Research, 40(2), 806\u2013841.","journal-title":"Contemporary Accounting Research"},{"key":"7011_CR28","unstructured":"Hui, B., Yang, J., Cui, Z., Yang, J., Liu, D., Zhang, L., Liu, T., Zhang, J., Yu, B., Lu, K., et al. (2024). Qwen2. 5-coder technical report. arXiv preprint. arXiv:2409.12186"},{"key":"7011_CR29","unstructured":"Hurst, A., Lerer, A., Goucher, A.P., Perelman, A., Ramesh, A., Clark, A., Ostrow, A., Welihinda, A., Hayes, A., Radford, A., et al. (2024). Gpt-4o system card. arXiv preprint. arXiv:2410.21276"},{"key":"7011_CR31","doi-asserted-by":"crossref","unstructured":"Jin, D., Jin, Z., Zhou, J. T., & Szolovits, P. (2020). Is Bert really robust? A strong baseline for natural language attack on text classification and entailment. In Proceedings of the AAAI conference on artificial intelligence (Vol. 34, pp. 8018\u20138025).","DOI":"10.1609\/aaai.v34i05.6311"},{"key":"7011_CR30","unstructured":"Jin, Z., Chen, Y., Leeb, F., Gresele, L., Kamal, O., Lyu, Z., Blin, K., Gonzalez Adauto, F., Kleiman-Weiner, M., Sachan, M., et al. (2024). Cladder: A benchmark to assess causal reasoning capabilities of language models. In Advances in neural information processing systems (Vol. 36)."},{"key":"7011_CR32","unstructured":"Jin, Z., Liu, J., Zhiheng, L., Poff, S., Sachan, M., Mihalcea, R., Diab, M. T., & Sch\u00f6lkopf, B. (2024). Can large language models infer causation from correlation? In The twelfth international conference on learning representations."},{"key":"7011_CR33","unstructured":"Jiralerspong, T., Chen, X., More, Y., Shah, V., Bengio, Y. (2024). Efficient causal graph discovery using large language models. In ICLR 2024 workshop: How far are we from AGI."},{"key":"7011_CR34","doi-asserted-by":"crossref","unstructured":"Kang, C., & Choi, J. (2023). Impact of co-occurrence on factualknowledge of large language models. In Findings of the Association for Computational Linguistics: EMNLP 2023 (pp. 7721\u20137735).","DOI":"10.18653\/v1\/2023.findings-emnlp.518"},{"key":"7011_CR99","unstructured":"Kiciman, E., Ness, R., Sharma, A., Tan, C.: Causal reasoning and large language models: Opening a new frontier for causality. Transactions on machine learning research (2024)"},{"key":"7011_CR36","doi-asserted-by":"crossref","unstructured":"Kim, Y., Guo, L., Yu, B., & Li, Y. (2023). Can ChatGPT understand causal language in science claims? In 13th Workshop on computational approaches to subjectivity, sentiment and social media analysis, WASSA 2023 (pp. 379\u2013389). Association for Computational Linguistics (ACL).","DOI":"10.18653\/v1\/2023.wassa-1.33"},{"key":"7011_CR37","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.4567607","author":"H Li","year":"2023","unstructured":"Li, H., Gao, H., Wu, C., & Vasarhelyi, M. A.(2023). Extracting financial data from unstructured sources: Leveraging large language models. Journal of Information Systems. https:\/\/doi.org\/10.2139\/ssrn.4567607","journal-title":"Journal of Information Systems"},{"key":"7011_CR39","doi-asserted-by":"crossref","unstructured":"Li, J., Ji, S., Du, T., Li, B., & Wang, T. (2019). Textbugger: Generating adversarial text against real-world applications. In 26th Annual network and distributed system security symposium, NDSS 2019. The Internet Society.","DOI":"10.14722\/ndss.2019.23138"},{"key":"7011_CR38","doi-asserted-by":"crossref","unstructured":"Li, Y., Guo, Y., Guerin, F., & Lin, C. (2024). An open-source data contamination report for large language models. In Findings of the Association for Computational Linguistics: EMNLP 2024 (pp. 528\u2013541)","DOI":"10.18653\/v1\/2024.findings-emnlp.30"},{"key":"7011_CR40","doi-asserted-by":"crossref","unstructured":"Liu, C., Chen, Y., Liu, T., Gong, M., Cheng, J., Han, B., & Zhang, K. (2024). Discovery of the hidden world with large language models. arXiv preprint. arXiv:2402.03941","DOI":"10.52202\/079017-3249"},{"key":"7011_CR41","doi-asserted-by":"crossref","unstructured":"Liu, X., Xu, P., Wu, J., Yuan, J., Yang, Y., Zhou, Y., Liu, F., Guan, T., Wang, H., Yu, T., McAuley, J., Ai, W., & Huang, F. (2025). Large language models and causal inference in collaboration: A comprehensive survey. In Findings of the Association for Computational Linguistics: NAACL 2025 (pp. 7668\u20137684).","DOI":"10.18653\/v1\/2025.findings-naacl.427"},{"key":"7011_CR42","unstructured":"Long, S., Pich\u00e9, A., Zantedeschi, V., Schuster, T., & Drouin, A. (2023). Causal discovery with language models as imperfect experts. arXiv preprint. arXiv:2307.02390"},{"key":"7011_CR43","unstructured":"Long, S., Schuster, T., & Pich\u00e9, A. (2022). Can large language models build causal graphs? In NeurIPS 2022 workshop on causal machine learning for real-world impact (CML4Impact 2022)."},{"key":"7011_CR44","doi-asserted-by":"crossref","unstructured":"Lu, Y., Liu, Q., Dai, D., Xiao, X., Lin, H., Han, X., Sun, L., & Wu, H. (2022). Unified structure generation for universal information extraction. In Proceedings of the 60th annual meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 5755\u20135772).","DOI":"10.18653\/v1\/2022.acl-long.395"},{"key":"7011_CR45","doi-asserted-by":"crossref","unstructured":"Miao, X., Li, Y., & Qian, T. (2023). Generating commonsense counterfactuals for stable relation extraction. In Proceedings of the 2023 conference on empirical methods in natural language processing (pp. 5654\u20135668).","DOI":"10.18653\/v1\/2023.emnlp-main.344"},{"key":"7011_CR46","doi-asserted-by":"crossref","unstructured":"Miao, X., Li, Y., Zhou, S., & Qian, T. (2024). Episodic memory retrieval from LLMS: A neuromorphic mechanism to generate commonsense counterfactuals for relation extraction. In Findings of the Association for Computational Linguistics ACL 2024 (pp. 2489\u20132511).","DOI":"10.18653\/v1\/2024.findings-acl.146"},{"key":"7011_CR47","unstructured":"Monajatipoor, M., Yang, J., Stremmel, J., Emami, M., Mohaghegh, F., Rouhsedaghat, M., & Chang, K.-W. (2024). LLMS in biomedicine: A study on clinical named entity recognition. arXiv preprint. arXiv:2404.07376"},{"key":"7011_CR48","unstructured":"Mondal, I., & Sancheti, A. (2024). How much reliable is chatgpt\u2019s prediction on information extraction under input perturbations? arXiv preprint. arXiv:2404.05088"},{"key":"7011_CR49","unstructured":"Naik, A., Ravichander, A., Sadeh, N., Rose, C., & Neubig, G. (2018). Stress test evaluation for natural language inference. In Proceedings of the 27th international conference on computational linguistics (pp. 2340\u20132353)."},{"key":"7011_CR52","volume-title":"Causality: models, reasoning, and inference","author":"J Pearl","year":"2000","unstructured":"Pearl, J. (2000). Causality: Models, reasoning, and inference. Cambridge University Press."},{"key":"7011_CR51","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511803161","volume-title":"Causality: models, reasoning, and inference","author":"J Pearl","year":"2009","unstructured":"Pearl, J. (2009). Causality: Models, reasoning, and inference. Cambridge University Press."},{"key":"7011_CR50","volume-title":"The book of why: the new science of cause and effect","author":"J Pearl","year":"2018","unstructured":"Pearl, J., & Mackenzie, D. (2018). The book of why: the new science of cause and effect. Basic Books."},{"issue":"8","key":"7011_CR53","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al. (2019). Language models are unsupervised multitask learners. OpenAI Blog, 1(8), 9.","journal-title":"OpenAI Blog"},{"key":"7011_CR54","doi-asserted-by":"crossref","unstructured":"Rajpoot, P., & Parikh, A. (2023). Gpt-finre: In-context learning for financial relation extraction using large language models. In Proceedings of the sixth workshop on financial technology and natural language processing (pp. 42\u201345).","DOI":"10.18653\/v1\/2023.finnlp-2.5"},{"issue":"6","key":"7011_CR55","doi-asserted-by":"publisher","first-page":"140","DOI":"10.1007\/s10994-025-06767-4","volume":"114","author":"S Raza","year":"2025","unstructured":"Raza, S., Bamgbose, O., Ghuge, S., Tavakoli, F., Reji, D. J., & Bashir, S. R. (2025). Developing safe and responsible large language model: can we balance bias reduction and language understanding? Machine Learning, 114(6), 140.","journal-title":"Machine Learning"},{"key":"7011_CR56","doi-asserted-by":"crossref","unstructured":"Reimers, N., & Gurevych, I. (2019). Sentence-Bert: Sentence embeddings using Siamese Bert-networks. In Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing (EMNLP-IJCNLP) (pp. 3982\u20133992).","DOI":"10.18653\/v1\/D19-1410"},{"key":"7011_CR57","doi-asserted-by":"crossref","unstructured":"Ren, S., Deng, Y., He, K., & Che, W. (2019). Generating natural language adversarial examples through probability weighted word saliency. In Proceedings of the 57th annual meeting of the Association for Computational Linguistics (pp. 1085\u20131097).","DOI":"10.18653\/v1\/P19-1103"},{"key":"7011_CR58","doi-asserted-by":"crossref","unstructured":"Ribeiro, M.T., Wu, T., Guestrin, C., & Singh, S. (2020). Beyond accuracy: Behavioral testing of nlp models with checklist. In Proceedings of the 58th annual meeting of the Association for Computational Linguistics. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2020.acl-main.442"},{"key":"7011_CR59","doi-asserted-by":"crossref","unstructured":"Romanou, A., Montariol, S., Paul, D., Laugier, L., Aberer, K., & Bosselut, A. (2023). Crab: Assessing the strength of causal relationships between real-world events. In Proceedings of the 2023 conference on empirical methods in natural language processing (pp. 15198\u201315216).","DOI":"10.18653\/v1\/2023.emnlp-main.940"},{"key":"7011_CR60","doi-asserted-by":"crossref","unstructured":"Sakib, M. N., Islam, M. A., Pathak, R., & Arifin, M. M. (2024). Risks, causes, and mitigations of widespread deployments of large language models (LLMS): A survey. In 2024 2nd International conference on artificial intelligence, blockchain, and Internet of Things (AIBThings) (pp. 1\u20137). IEEE.","DOI":"10.1109\/AIBThings63359.2024.10863356"},{"issue":"3","key":"7011_CR61","doi-asserted-by":"publisher","first-page":"261","DOI":"10.1561\/1900000003","volume":"1","author":"S. Sarawagi","year":"2008","unstructured":"Sarawagi, S., et al. (2008). Information extraction. Foundations and Trends\u00ae in Databases, 1(3), 261\u2013377.","journal-title":"Foundations and Trends\u00ae in Databases"},{"issue":"17","key":"7011_CR62","doi-asserted-by":"publisher","first-page":"7782","DOI":"10.3390\/app14177782","volume":"14","author":"S Shahriar","year":"2024","unstructured":"Shahriar, S., Lund, B.D., Mannuru, N. R., Arshad, M. A., Hayawi, K., Bevara, R. V. K., Mannuru, A., & Batool, L. (2024). Putting GPT-4O to the sword: A comprehensive evaluation of language, vision, speech, and multimodal proficiency. Applied Sciences, 14(17), 7782.","journal-title":"Applied Sciences"},{"issue":"267","key":"7011_CR63","first-page":"467","volume":"49","author":"HA Simon","year":"1954","unstructured":"Simon, H. A.(1954). Spurious correlation: A causal interpretation. Journal of the American Statistical Association, 49(267), 467\u2013479.","journal-title":"Journal of the American Statistical Association"},{"key":"7011_CR64","unstructured":"Singh, A., Singh, N., & Vatsal, S. (2024). Robustness of llms to perturbations in text. arXiv preprint. arXiv:2407.08989"},{"key":"7011_CR65","doi-asserted-by":"crossref","unstructured":"Speer, R., Chin, J., & Havasi, C. (2017). Conceptnet 5.5: An open multilingual graph of general knowledge. In Proceedings of the AAAI conference on artificial intelligence (Vol. 31).","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"7011_CR66","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40535-016-0018-x","volume":"3","author":"P. Spirtes","year":"2016","unstructured":"Spirtes, P., & Zhang, K. (2016). Causal discovery and inference: Concepts and recent methodological advances. Applied Informatics, 3, 1\u201328.","journal-title":"Applied Informatics"},{"key":"7011_CR67","volume-title":"Causation, prediction, and search","author":"P Spirtes","year":"2000","unstructured":"Spirtes, P., Glymour, C. N., & Scheines, R. (2000). Causation, prediction, and search. MIT."},{"key":"7011_CR68","doi-asserted-by":"crossref","unstructured":"Sun, K., Zhang, R., Mensah, S., Mao, Y., & Liu, X. (2020). Recurrent interaction network for jointly extracting entities and classifying relations. arXiv preprint. arXiv:2005.00162","DOI":"10.18653\/v1\/2020.emnlp-main.304"},{"key":"7011_CR69","doi-asserted-by":"crossref","unstructured":"Tiwari, K., Yuan, S., & Zhang, L. (2022). Robust hate speech detection via mitigating spurious correlations. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th international joint conference on natural language processing (Volume 2: Short Papers) (pp. 51\u201356).","DOI":"10.18653\/v1\/2022.aacl-short.7"},{"key":"7011_CR70","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. In Advances in neural information processing systems (Vol. 30)."},{"key":"7011_CR71","unstructured":"Verma, T., & Pearl, J. Equivalence and synthesis of causal models. In: Proceedings of the Sixth Annual Conference on Uncertainty in Artificial Intelligence, pp. 255\u2013270 (1990)."},{"key":"7011_CR72","unstructured":"Villalobos, P., Ho, A., Sevilla, J., Besiroglu, T., Heim, L., & Hobbhahn, M. (2024). Position: Will we run out of data? Limits of LLM scaling based on human-generated data. In International conference on machine learning (pp. 49523\u201349544). PMLR."},{"key":"7011_CR73","doi-asserted-by":"crossref","unstructured":"Wan, Z., Cheng, F., Mao, Z., Liu, Q., Song, H., Li, J., & Kurohashi, S. (2023). GPT-RE: In-context learning for relation extraction using large language models. In Proceedings of the 2023 conference on empirical methods in natural language processing (pp. 3534\u20133547).","DOI":"10.18653\/v1\/2023.emnlp-main.214"},{"key":"7011_CR74","unstructured":"Wan, G., Wu, Y., Hu, M., Chu, Z., & Li, S. (2024). Bridging causal discovery and large language models: A comprehensive survey of integrative approaches and future directions. arXiv preprint. arXiv:2402.11068"},{"key":"7011_CR78","unstructured":"Wang, S., Sun, X., Li, X., Ouyang, R., Wu, F., Zhang, T., Li, J., & Wang, G. (2023). GPT-NER: Named entity recognition via large language models. arXiv preprint. arXiv:2304.10428"},{"key":"7011_CR77","doi-asserted-by":"crossref","unstructured":"Wang, Q., Ding, K., Liang, B., Yang, M., & Xu, R. (2023). Reducing spurious correlations in aspect-based sentiment analysis with explanation from large language models. In Findings of the Association for Computational Linguistics: EMNLP 2023 (pp. 2930\u20132941)","DOI":"10.18653\/v1\/2023.findings-emnlp.193"},{"key":"7011_CR76","doi-asserted-by":"crossref","unstructured":"Wang, Y., Chen, M., Zhou, W., Cai, Y., Liang, Y., Liu, D., Yang, B., Liu, J., & Hooi, B. (2022). Should we rely on entity mentions for relation extraction? debiasing relation extraction with counterfactual analysis. In Proceedings of the 2022 conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (pp. 3071\u20133081).","DOI":"10.18653\/v1\/2022.naacl-main.224"},{"key":"7011_CR75","unstructured":"Wang, Y., & Zhao, Y. (2024). RUPBENCH: Benchmarking reasoning under perturbations for robustness evaluation in large language models. arXiv preprint. arXiv:2406.11020"},{"key":"7011_CR79","unstructured":"Willig, M., Ze\u010devi\u0107, M., Dhami, D. S., & Kersting, K. (2023). Probing for correlations of causal facts: Large language models and causality. In The 11th International conference on learning representations (ICLR 2023)."},{"key":"7011_CR80","unstructured":"Wu, A., Kuang, K., Zhu, M., Wang, Y., Zheng, Y., Han, K., Li, B., Chen, G., Wu, F., & Zhang, K. (2024). Causality for large language models. CoRR. arXiv:2410.15319 [cs.CL]"},{"key":"7011_CR81","unstructured":"Wu, S., Li, D., Ye, H., Chen, Z., Zhou, J., Lou, J., Zheng, Z., & Ng, S.-K. (2025). Tsrating: Rating quality of diverse time series data by meta-learning from LLM judgment. arXiv preprint. arXiv:2506.01290"},{"key":"7011_CR82","doi-asserted-by":"crossref","unstructured":"Wu, J., Yu, T., Chen, X., Wang, H., Rossi, R., Kim, S., Rao, A., & McAuley, J. (2024) DECOT: Debiasing chain-of-thought for knowledge-intensive tasks in large language models via causal intervention. In Proceedings of the 62nd annual meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 14073\u201314087).","DOI":"10.18653\/v1\/2024.acl-long.758"},{"key":"7011_CR84","unstructured":"Xiao, M., Xiao, Y., Ji, S., Li, Y., Xue, L., & Zhang, P. (2025). ABFS: Natural robustness testing for llm-based nlp software. arXiv preprint. arXiv:2503.01319"},{"key":"7011_CR83","doi-asserted-by":"crossref","unstructured":"Xiao, Y., Hu, Y., Choo, K., & Lee, R. (2024). TOXICLOAKCN: Evaluating robustness of offensive language detection in chinese with cloaking perturbations. In Proceedings of the 2024 conference on empirical methods in natural language processing (pp. 6012\u20136025).","DOI":"10.18653\/v1\/2024.emnlp-main.345"},{"key":"7011_CR85","unstructured":"Yang, A., Li, A., Yang, B., Zhang, B., Hui, B., Zheng, B., Yu, B., Gao, C., Huang, C., Lv, C., et al. (2025). QWEN3 technical report. arXiv preprint. arXiv:2505.09388"},{"key":"7011_CR86","unstructured":"Ye, W., Zheng, G., Cao, X., Ma, Y., & Zhang, A. (2024). Spurious correlations in machine learning: A survey. arXiv preprint. arXiv:2402.12715"},{"key":"7011_CR87","unstructured":"Yu, T., Jing, Y., Zhang, X., Jiang, W., Wu, W., Wang, Y., Hu, W., Du, B., & Tao, D. (2025). Benchmarking reasoning robustness in large language models. arXiv preprint. arXiv:2503.04550"},{"key":"7011_CR88","doi-asserted-by":"crossref","unstructured":"Yu, T., Yang, M., Li, C., & Xu, R. (2023). Reducing spurious correlations for relation extraction by feature decomposition and semantic augmentation. In Proceedings of the 46th international ACM SIGIR conference on research and development in information retrieval (pp. 2324\u20132328).","DOI":"10.1145\/3539618.3592050"},{"issue":"8","key":"7011_CR89","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10994-025-06811-3","volume":"114","author":"J Yuan","year":"2025","unstructured":"Yuan, J., Zheng, H., Yu, H., & Luo, X. (2025). Entangle-then-disentangle: A novel approach for enhancing large vision-language model. Machine Learning, 114(8), 1\u201328.","journal-title":"Machine Learning"},{"issue":"8","key":"7011_CR90","first-page":"1","volume":"2023","author":"M Zecevic","year":"2023","unstructured":"Zecevic, M., Willig, M., Dhami, D. S., & Kersting, K. (2023). Causal parrots: Large language models may talk causality but are not causal. Transactions on Machine Learning Research, 2023(8), 1\u201327.","journal-title":"Transactions on Machine Learning Research"},{"key":"7011_CR93","doi-asserted-by":"crossref","unstructured":"Zhang, C., Zhang, L., Wu, J., He, Y., & Zhou, D. (2025). Causal prompting: Debiasing large language model prompting based on front-door adjustment. In Proceedings of the AAAI conference on artificial intelligence (Vol. 39, pp. 25842\u201325850).","DOI":"10.1609\/aaai.v39i24.34777"},{"key":"7011_CR92","doi-asserted-by":"crossref","unstructured":"Zhang, M., Qian, T., Zhang, T., & Miao, X. (2023). Towards model robustness: Generating contextual counterfactuals for entities in relation extraction. In Proceedings of the ACM web conference 2023 (pp. 1832\u20131842).","DOI":"10.1145\/3543507.3583504"},{"key":"7011_CR91","doi-asserted-by":"crossref","unstructured":"Zhang, W., Lu, W., Wang, J., Wang, Y., Chen, L., Jiang, H., Liu, J., & Ruan, T. (2024). Unexpected phenomenon: LLMS\u2019 spurious associations in information extraction. In Findings of the Association for Computational Linguistics ACL 2024 (pp. 9176\u20139190).","DOI":"10.18653\/v1\/2024.findings-acl.545"},{"key":"7011_CR94","doi-asserted-by":"crossref","unstructured":"Zhao, B., Zhang, Y., Xu, Z., Ren, Y., Zhang, X., Luo, R., Feng, Z., & Xia, F. (2025). Unbiased reasoning for knowledge-intensive tasks in large language models via conditional front-door adjustment. arXiv preprint arXiv:2508.16910","DOI":"10.1145\/3746252.3761103"},{"key":"7011_CR95","unstructured":"Zhao, W. X., Zhou, K., Li, J., Tang, T., Wang, X., Hou, Y., Min, Y., Zhang, B., Zhang, J., Dong, Z., et al. (2023). A survey of large language models. arXiv e-prints, 2303."},{"key":"7011_CR96","doi-asserted-by":"crossref","unstructured":"Zheng, J., Ritter, A., & Xu, W. (2024). Neo-bench: Evaluating robustness of large language models with neologisms. In Proceedings of the 62nd annual meeting of the Association for Computational Linguistics (Volume 1: Long Papers), (pp. 13885\u201313906).","DOI":"10.18653\/v1\/2024.acl-long.749"},{"key":"7011_CR97","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Xu, P., Liu, X., An, B., Ai, W., & Huang, F. (2024). Explore spurious correlations at the concept level in language models for text classification. In Proceedings of the 62nd annual meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 478\u2013492).","DOI":"10.18653\/v1\/2024.acl-long.28"},{"key":"7011_CR98","doi-asserted-by":"crossref","unstructured":"Zhu, K., Wang, J., Zhou, J., Wang, Z., Chen, H., Wang, Y., Yang, L., Ye, W., Zhang, Y., Gong, N., et al. (2023). Promptrobust: Towards evaluating the robustness of large language models on adversarial prompts. In Proceedings of the 1st ACM workshop on large AI systems and models with privacy and safety analysis (pp. 57\u201368).","DOI":"10.1145\/3689217.3690621"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-026-07011-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-026-07011-3","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-026-07011-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:32:57Z","timestamp":1778081577000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-026-07011-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,9]]},"references-count":98,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["7011"],"URL":"https:\/\/doi.org\/10.1007\/s10994-026-07011-3","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,9]]},"assertion":[{"value":"14 July 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 October 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 February 2026","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 February 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This study did not involve any human participants or animals. Ethical approval and informed consent were therefore not required.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval and Consent to Participate"}},{"value":"Not applicable. This study does not contain any individual person\u2019s data in any form (including individual details, images, or videos).","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for Publication"}}],"article-number":"59"}}