{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T09:27:06Z","timestamp":1775813226726,"version":"3.50.1"},"reference-count":59,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272024"],"award-info":[{"award-number":["62272024"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s10489-026-07102-7","type":"journal-article","created":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T08:28:04Z","timestamp":1775809684000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["CASET: a cascaded attention-based framework for semantic explainability of toxicity in large language 
models"],"prefix":"10.1007","volume":"56","author":[{"given":"Chen","family":"Chen","sequence":"first","affiliation":[]},{"given":"Hanyang","family":"Xia","sequence":"additional","affiliation":[]},{"given":"Weidong","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Chunhe","family":"Xia","sequence":"additional","affiliation":[]},{"given":"Mengyao","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Rui","family":"Hao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0227-9557","authenticated-orcid":false,"given":"Tianbo","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,4,10]]},"reference":[{"key":"7102_CR1","doi-asserted-by":"publisher","unstructured":"Ma X, Gao Y, Wang Y, Wang R, Wang X, Sun Y, Ding Y, Xu H, Chen Y, Zhao Y et al (2025) Safety at scale: a comprehensive survey of large model safety. arXiv:2502.05206. https:\/\/doi.org\/10.48550\/arXiv.2502.05206","DOI":"10.48550\/arXiv.2502.05206"},{"key":"7102_CR2","doi-asserted-by":"publisher","unstructured":"Welbl J, Glaese A, Uesato J, Dathathri S, Mellor J, Hendricks LA, Anderson K, Kohli P, Coppin B, Huang P-S (2021) Challenges in detoxifying language models. In: Findings of the association for computational linguistics: EMNLP 2021, pp 2447\u20132469. https:\/\/doi.org\/10.18653\/v1\/2021.findings-emnlp.210","DOI":"10.18653\/v1\/2021.findings-emnlp.210"},{"key":"7102_CR3","doi-asserted-by":"publisher","unstructured":"Luong T, Le T-T, Ngo L, Nguyen T (2024) Realistic evaluation of toxicity in large language models. In: Findings of the association for computational linguistics: ACL 2024, pp 1038\u20131047. https:\/\/doi.org\/10.18653\/v1\/2024.findings-acl.61","DOI":"10.18653\/v1\/2024.findings-acl.61"},{"key":"7102_CR4","doi-asserted-by":"publisher","unstructured":"Shen T, Jin R, Huang Y, Liu C, Dong W, Guo Z, Wu X, Liu Y, Xiong D (2023) Large language model alignment: a survey. 
arXiv:2309.15025. https:\/\/doi.org\/10.48550\/arXiv.2309.15025","DOI":"10.48550\/arXiv.2309.15025"},{"key":"7102_CR5","doi-asserted-by":"publisher","unstructured":"Wu X, Zhao H, Zhu Y, Shi Y, Yang F, Hu L, Liu T, Zhai X, Yao W, Li J et al (2024) Usable xai: 10 strategies towards exploiting explainability in the llm era. arXiv:2403.08946. https:\/\/doi.org\/10.48550\/arXiv.2403.08946","DOI":"10.48550\/arXiv.2403.08946"},{"issue":"2","key":"7102_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3639372","volume":"15","author":"H Zhao","year":"2024","unstructured":"Zhao H, Chen H, Yang F, Liu N, Deng H, Cai H, Wang S, Yin D, Du M (2024) Explainability for large language models: a survey. ACM Trans Intell Syst Technol 15(2):1\u201338. https:\/\/doi.org\/10.1145\/3639372","journal-title":"ACM Trans Intell Syst Technol"},{"key":"7102_CR7","doi-asserted-by":"publisher","unstructured":"Yang Z, Tullo D, Rabbany R (2024) Toxisight: insights towards detected chat toxicity. In: The 7th BlackboxNLP workshop, pp 1\u20133. https:\/\/doi.org\/10.48448\/drhp-d894","DOI":"10.48448\/drhp-d894"},{"key":"7102_CR8","doi-asserted-by":"publisher","unstructured":"Bereska L, Gavves S (2024) Mechanistic interpretability for AI safety - a review. Trans Mach Learn Res, 2835\u20138856. https:\/\/doi.org\/10.48550\/arXiv.2404.14082","DOI":"10.48550\/arXiv.2404.14082"},{"key":"7102_CR9","doi-asserted-by":"publisher","unstructured":"Mahajan A, Shah D, Jafar G (2021) Explainable ai approach towards toxic comment classification. In: Emerging technologies in data mining and information security: proceedings of IEMIS 2020, pp 849\u2013858. 
https:\/\/doi.org\/10.1007\/978-981-33-4367-2_81","DOI":"10.1007\/978-981-33-4367-2_81"},{"issue":"3","key":"7102_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.55041\/IJSREM29433","volume":"8","author":"MJ Dias","year":"2024","unstructured":"Dias MJ, Tawde PD (2024) Enhancing transparency and interpretability in toxic comment classification: a study on the integration of explainable artificial intelligence (xai) techniques. Int J Sci Res Eng Manag (IJSREM) 8(3):1\u20138. https:\/\/doi.org\/10.55041\/IJSREM29433","journal-title":"Int J Sci Res Eng Manag (IJSREM)"},{"key":"7102_CR11","doi-asserted-by":"publisher","unstructured":"Sarker J, Sultana S, Wilson SR, Bosu A (2023) Toxispanse: an explainable toxicity detection in code review comments. In: 2023 ACM\/IEEE International Symposium on Empirical Software Engineering and Measurement (ESEM), pp 1\u201312. https:\/\/doi.org\/10.1109\/ESEM56168.2023.10304855","DOI":"10.1109\/ESEM56168.2023.10304855"},{"issue":"2","key":"7102_CR12","doi-asserted-by":"publisher","first-page":"495","DOI":"10.1007\/s10844-022-00726-4","volume":"60","author":"KB Nelatoori","year":"2023","unstructured":"Nelatoori KB, Kommanti HB (2023) Multi-task learning for toxic comment classification and rationale extraction. J Intell Inf Syst 60(2):495\u2013519. https:\/\/doi.org\/10.1007\/s10844-022-00726-4","journal-title":"J Intell Inf Syst"},{"issue":"2","key":"7102_CR13","doi-asserted-by":"publisher","first-page":"144","DOI":"10.56472\/25832646\/JETA-V3I6P110","volume":"3","author":"P Vadlapati","year":"2023","unstructured":"Vadlapati P (2023) Investigating the impact of linguistic errors of prompts on llm accuracy. ESP J Eng Technol Adv 3(2):144\u2013147. https:\/\/doi.org\/10.56472\/25832646\/JETA-V3I6P110","journal-title":"ESP J Eng Technol Adv"},{"key":"7102_CR14","doi-asserted-by":"publisher","unstructured":"Li X, Zhou Z, Zhu J, Yao J, Liu T, Han B (2024) Deepinception: hypnotize large language model to be jailbreaker. 
In: Neurips safe generative AI workshop 2024, pp 1\u201365. https:\/\/doi.org\/10.48550\/arXiv.2311.03191","DOI":"10.48550\/arXiv.2311.03191"},{"key":"7102_CR15","doi-asserted-by":"publisher","unstructured":"Amara K, Sevastjanova R, El-Assady M (2025) Concept-level explainability for auditing & steering llm responses. arXiv:2505.07610. https:\/\/doi.org\/10.48550\/arXiv.2505.07610","DOI":"10.48550\/arXiv.2505.07610"},{"key":"7102_CR16","doi-asserted-by":"publisher","unstructured":"Si WM, Backes M, Blackburn J, De\u00a0Cristofaro E, Stringhini G, Zannettou S, Zhang Y (2022) Why so toxic? measuring and triggering toxic behavior in open-domain chatbots. In: Proceedings of the 2022 ACM SIGSAC conference on computer and communications security, pp 2659\u20132673. https:\/\/doi.org\/10.1145\/3548606.3560599","DOI":"10.1145\/3548606.3560599"},{"key":"7102_CR17","doi-asserted-by":"publisher","unstructured":"Jain D, Kumar P, Gehman S, Zhou X, Hartvigsen T, Sap M (2024) Polyglotoxicityprompts: multilingual evaluation of neural toxic degeneration in large language models. arXiv:2405.09373. https:\/\/doi.org\/10.48550\/arXiv.2405.09373","DOI":"10.48550\/arXiv.2405.09373"},{"issue":"2","key":"7102_CR18","doi-asserted-by":"publisher","first-page":"433","DOI":"10.28991\/HIJ-2025-06-02-05","volume":"6","author":"S Zhang","year":"2025","unstructured":"Zhang S, Fan X, Song B, Liang X, Zhang Q, Wang Z, Zhang B (2025) Research on RAG-based cognitive large language model training method for power standard knowledge. HighTech Innov J 6(2):433\u2013443. https:\/\/doi.org\/10.28991\/HIJ-2025-06-02-05","journal-title":"HighTech Innov J"},{"issue":"2","key":"7102_CR19","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3710959","volume":"9","author":"Y Li","year":"2025","unstructured":"Li Y, Zhang P, Gu H, Lu T, Qiao S, Shu Y, Shao Y, Gu N (2025) Demod: a holistic tool with explainable detection and personalized modification for toxicity censorship. 
Proc ACM Human-Comput Interact 9(2):1\u201324. https:\/\/doi.org\/10.1145\/3710959","journal-title":"Proc ACM Human-Comput Interact"},{"key":"7102_CR20","doi-asserted-by":"publisher","unstructured":"Horovicz M, Goldshmidt R (2024) Tokenshap: interpreting large language models with monte carlo shapley value estimation. In: Proceedings of the 1st workshop on NLP for science (NLP4Science), pp 1\u20138. https:\/\/doi.org\/10.18653\/v1\/2024.nlp4science-1.1","DOI":"10.18653\/v1\/2024.nlp4science-1.1"},{"key":"7102_CR21","doi-asserted-by":"publisher","unstructured":"Zhang B, Shen X, Si WM, Sha Z, Chen Z, Salem A, Shen Y, Backes M, Zhang Y (2023) Comprehensive assessment of toxicity in chatgpt. arXiv:2311.14685. https:\/\/doi.org\/10.48550\/arXiv.2311.14685","DOI":"10.48550\/arXiv.2311.14685"},{"key":"7102_CR22","doi-asserted-by":"publisher","unstructured":"Shen X, Chen Z, Backes M, Shen Y, Zhang Y (2024) \u201cdo anything now\u201d: characterizing and evaluating in-the-wild jailbreak prompts on large language models. In: Proceedings of the 2023 conference on empirical methods in natural language processing, pp 1671\u20131685. https:\/\/doi.org\/10.1145\/3658644.3670388","DOI":"10.1145\/3658644.3670388"},{"key":"7102_CR23","doi-asserted-by":"publisher","unstructured":"Esiobu D, Tan X, Hosseini S, Ung M, Zhang Y, Fernandes J, Dwivedi-Yu J, Presani E, Williams A, Smith EM (2023) Robbie: robust bias evaluation of large generative language models. In: The 2023 conference on empirical methods in natural language processing, pp 3764\u20133814. https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.230","DOI":"10.18653\/v1\/2023.emnlp-main.230"},{"key":"7102_CR24","doi-asserted-by":"publisher","unstructured":"Cui S, Feng X, Wang Y, Yang J, Zhang Z, Sikdar B, Wang H, Qiu H, Huang M (2025) When smiley turns hostile: interpreting how emojis trigger llms\u2019 toxicity. arXiv:2509.11141. 
https:\/\/doi.org\/10.48550\/arXiv.2509.11141","DOI":"10.48550\/arXiv.2509.11141"},{"key":"7102_CR25","doi-asserted-by":"publisher","unstructured":"Dementieva D, Babakov N, Ronen A, Ayele AA, Rizwan N, Schneider F, Wang X, Yimam SM, Moskovskiy DA, Stakovskii E et al (2025) Multilingual and explainable text detoxification with parallel corpora. In: Proceedings of the 31st international conference on computational linguistics, pp 7998\u20138025. https:\/\/doi.org\/10.48550\/arXiv.2412.11691","DOI":"10.48550\/arXiv.2412.11691"},{"key":"7102_CR26","unstructured":"Zhang Q, Qiu H, Wang D, Li Y, Zhang T, Zhu W, Weng H, Yan L, Zhang C (2025) A benchmark for semantic sensitive information in llms outputs. In: The thirteenth international conference on learning representations, pp 1\u201328"},{"issue":"2","key":"7102_CR27","doi-asserted-by":"publisher","first-page":"1037","DOI":"10.28991\/ESJ-2025-09-02-027","volume":"9","author":"MF Ishrak","year":"2025","unstructured":"Ishrak MF, Rahman MM, Joy MIK, Tamuly A, Akter S, Tanim DM, Jawar S, Ahmed N, Rahman MS (2025) Vision transformer embedded feature fusion model with pre-trained transformers for keratoconus disease classification. Emerg Sci J 9(2):1037\u20131075. https:\/\/doi.org\/10.28991\/ESJ-2025-09-02-027","journal-title":"Emerg Sci J"},{"issue":"2","key":"7102_CR28","doi-asserted-by":"publisher","first-page":"916","DOI":"10.28991\/ESJ-2025-09-02-021","volume":"9","author":"MA Hossain","year":"2025","unstructured":"Hossain MA, Asa TA, Mahmud MZ, Azad A, Rahman MZ, Moni MA, Moustafa A (2025) Genetic links between common lung diseases and lung cancer progression: bioinformatics and machine learning insights. Emerg Sci J 9(2):916\u2013937. https:\/\/doi.org\/10.28991\/ESJ-2025-09-02-021","journal-title":"Emerg Sci J"},{"key":"7102_CR29","doi-asserted-by":"publisher","unstructured":"Nori H, Jenkins S, Koch P, Caruana R (2019) Interpretml: a unified framework for machine learning interpretability. arXiv:1909.09223. 
https:\/\/doi.org\/10.48550\/arXiv.1909.09223","DOI":"10.48550\/arXiv.1909.09223"},{"key":"7102_CR30","doi-asserted-by":"publisher","unstructured":"Ribeiro MT, Singh S, Guestrin C (2016) \u201cwhy should i trust you?\u201d explaining the predictions of any classifier. In: Proceedings of the 22nd ACM SIGKDD international conference on knowledge discovery and data mining, pp 1135\u20131144. https:\/\/doi.org\/10.1145\/2939672.2939778","DOI":"10.1145\/2939672.2939778"},{"key":"7102_CR31","doi-asserted-by":"publisher","unstructured":"Lundberg SM, Lee S-I (2017) A unified approach to interpreting model predictions. In: Proceedings of the 31st international conference on neural information processing systems, pp 4768\u20134777. https:\/\/doi.org\/10.48550\/arXiv.1705.07874","DOI":"10.48550\/arXiv.1705.07874"},{"issue":"2","key":"7102_CR32","doi-asserted-by":"publisher","first-page":"161","DOI":"10.2307\/1269043","volume":"33","author":"MD Morris","year":"1991","unstructured":"Morris MD (1991) Factorial sampling plans for preliminary computational experiments. Technometrics 33(2):161\u2013174. https:\/\/doi.org\/10.2307\/1269043","journal-title":"Technometrics"},{"key":"7102_CR33","doi-asserted-by":"publisher","unstructured":"Zhou B, Khosla A, Lapedriza A, Oliva A, Torralba A (2016) Learning deep features for discriminative localization. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2921\u20132929. https:\/\/doi.org\/10.1109\/CVPR.2016.319","DOI":"10.1109\/CVPR.2016.319"},{"key":"7102_CR34","doi-asserted-by":"publisher","unstructured":"Selvaraju RR, Cogswell M, Das A, Vedantam R, Parikh D, Batra D (2017) Grad-cam: visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE international conference on computer vision, pp 618\u2013626. 
https:\/\/doi.org\/10.1007\/s11263-019-01228-7","DOI":"10.1007\/s11263-019-01228-7"},{"issue":"3","key":"7102_CR35","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1145\/3236386.3241340","volume":"16","author":"ZC Lipton","year":"2018","unstructured":"Lipton ZC (2018) The mythos of model interpretability: in machine learning, the concept of interpretability is both important and slippery. Queue 16(3):31\u201357. https:\/\/doi.org\/10.1145\/3236386.3241340","journal-title":"Queue"},{"key":"7102_CR36","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1016\/j.patrec.2019.12.020","volume":"131","author":"S Zheng","year":"2020","unstructured":"Zheng S, Ding C (2020) A group lasso based sparse knn classifier. Pattern Recogn Lett 131:227\u2013233. https:\/\/doi.org\/10.1016\/j.patrec.2019.12.020","journal-title":"Pattern Recogn Lett"},{"issue":"8","key":"7102_CR37","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3546577","volume":"55","author":"A Madsen","year":"2022","unstructured":"Madsen A, Reddy S, Chandar S (2022) Post-hoc interpretability for neural nlp: a survey. ACM Comput Surv 55(8):1\u201342. https:\/\/doi.org\/10.1145\/3546577","journal-title":"ACM Comput Surv"},{"key":"7102_CR38","unstructured":"Vig J (2019) Bertviz: a tool for visualizing multihead self-attention in the bert model. In: ICLR workshop: debugging machine learning models, vol 3, pp 1\u20136"},{"issue":"1","key":"7102_CR39","doi-asserted-by":"publisher","first-page":"976","DOI":"10.1109\/TVCG.2021.3114683","volume":"28","author":"T Jaunet","year":"2021","unstructured":"Jaunet T, Kervadec C, Vuillemot R, Antipov G, Baccouche M, Wolf C (2021) Visqa: X-raying vision and language reasoning in transformers. IEEE Trans Visual Comput Graphics 28(1):976\u2013986. 
https:\/\/doi.org\/10.1109\/TVCG.2021.3114683","journal-title":"IEEE Trans Visual Comput Graphics"},{"key":"7102_CR40","doi-asserted-by":"publisher","unstructured":"Yu Y, Liu N, Lu F, Gao T, Jafarzadeh S, Silling S (2024) Nonlocal attention operator: materializing hidden knowledge towards interpretable physics discovery. In: Proceedings of the 38th international conference on neural information processing systems, pp 113797\u2013113822. https:\/\/doi.org\/10.52202\/079017-3613","DOI":"10.52202\/079017-3613"},{"key":"7102_CR41","doi-asserted-by":"publisher","unstructured":"Torres F, Zhang H, Sicre R, Ayache S, Avrithis Y (2024) Ca-stream: attention-based pooling for interpretable image recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8206\u20138211. https:\/\/doi.org\/10.48550\/arXiv.2404.14996","DOI":"10.48550\/arXiv.2404.14996"},{"issue":"1","key":"7102_CR42","doi-asserted-by":"publisher","first-page":"262","DOI":"10.1109\/TVCG.2023.3327163","volume":"30","author":"C Yeh","year":"2023","unstructured":"Yeh C, Chen Y, Wu A, Chen C, Vi\u00e9gas F, Wattenberg M (2023) Attentionviz: a global view of transformer attention. IEEE Trans Visual Comput Graphics 30(1):262\u2013272. https:\/\/doi.org\/10.1109\/TVCG.2023.3327163","journal-title":"IEEE Trans Visual Comput Graphics"},{"key":"7102_CR43","doi-asserted-by":"publisher","unstructured":"Leong CT, Cheng Y, Wang J, Wang J, Li W (2023) Self-detoxifying language models via toxification reversal. In: Proceedings of the 2023 conference on empirical methods in natural language processing, pp 4433\u20134449. 
https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.269","DOI":"10.18653\/v1\/2023.emnlp-main.269"},{"issue":"14","key":"7102_CR44","doi-asserted-by":"publisher","first-page":"3048","DOI":"10.1080\/00949655.2025.2516793","volume":"95","author":"S Ahmad","year":"2025","unstructured":"Ahmad S, Wang H (2025) Tv-ccanm: a transformer variational inference in confounding cascade additive noise model for causal effect estimation. J Stat Comput Simul 95(14):3048\u20133076. https:\/\/doi.org\/10.1080\/00949655.2025.2516793","journal-title":"J Stat Comput Simul"},{"key":"7102_CR45","doi-asserted-by":"publisher","unstructured":"Dong M, Zhang J, Zheng B, Tu X, Hu P, He T (2025) Dscd: large language model detoxification with self-constrained decoding. In: Proceedings of the 2025 conference on empirical methods in natural language processing, pp 3969\u20133984. https:\/\/doi.org\/10.18653\/v1\/2025.emnlp-main.197","DOI":"10.18653\/v1\/2025.emnlp-main.197"},{"key":"7102_CR46","doi-asserted-by":"publisher","unstructured":"Wang H, Yue Y, Lu R, Shi J, Zhao A, Wang S, Song S, Huang G (2025) Model surgery: modulating llm\u2019s behavior via simple parameter editing. In: Proceedings of the 2025 conference of the nations of the americas chapter of the association for computational linguistics: human language technologies (Volume 1: Long Papers), pp 6337\u20136357. https:\/\/doi.org\/10.18653\/v1\/2025.naacl-long.321","DOI":"10.18653\/v1\/2025.naacl-long.321"},{"key":"7102_CR47","doi-asserted-by":"publisher","unstructured":"Wang M, Zhang N, Xu Z, Xi Z, Deng S, Yao Y, Zhang Q, Yang L, Wang J, Chen H (2024) Detoxifying large language models via knowledge editing. In: Proceedings of the 62nd annual meeting of the association for computational linguistics (Volume 1: Long Papers), pp 3093\u20133118. https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.171","DOI":"10.18653\/v1\/2024.acl-long.171"},{"key":"7102_CR48","unstructured":"Chomsky N (1970) Remarks on nominalization. 
Read English Transform Grammar, 184\u2013221"},{"key":"7102_CR49","doi-asserted-by":"publisher","first-page":"117020","DOI":"10.1016\/j.cam.2025.117020","volume":"475","author":"S Ahmad","year":"2025","unstructured":"Ahmad S, Shah K, Debbouche A (2025) Structural equation modelling for causal effect estimation with machine learning. J Comput Appl Math 475:117020. https:\/\/doi.org\/10.1016\/j.cam.2025.117020","journal-title":"J Comput Appl Math"},{"key":"7102_CR50","doi-asserted-by":"publisher","unstructured":"Chen H, Zheng G, Ji Y (2020) Generating hierarchical explanations on text classification via feature interaction detection. In: Proceedings of the 58th annual meeting of the association for computational linguistics, pp 5578\u20135593. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.494","DOI":"10.18653\/v1\/2020.acl-main.494"},{"key":"7102_CR51","doi-asserted-by":"publisher","unstructured":"Andriushchenko M, Flammarion N (2024) Does refusal training in LLMs generalize to the past tense? In: Neurips safe generative AI workshop 2024, pp 1\u201314. https:\/\/doi.org\/10.48550\/arXiv.2407.11969","DOI":"10.48550\/arXiv.2407.11969"},{"key":"7102_CR52","doi-asserted-by":"publisher","unstructured":"Amara K, Sevastjanova R, El-Assady M (2024) Syntaxshap: syntax-aware explainability method for text generation. In: Findings of the association for computational linguistics ACL 2024, pp 4551\u20134566. https:\/\/doi.org\/10.18653\/v1\/2024.findings-acl.270","DOI":"10.18653\/v1\/2024.findings-acl.270"},{"key":"7102_CR53","doi-asserted-by":"publisher","unstructured":"Gehman S, Gururangan S, Sap M, Choi Y, Smith NA (2020) Realtoxicityprompts: evaluating neural toxic degeneration in language models. In: Findings of the association for computational linguistics: EMNLP 2020, pp 3356\u20133369. 
https:\/\/doi.org\/10.18653\/v1\/2020.findings-emnlp.301","DOI":"10.18653\/v1\/2020.findings-emnlp.301"},{"key":"7102_CR54","doi-asserted-by":"publisher","unstructured":"Wang B, Chen W, Pei H, Xie C, Kang M, Zhang C, Xu C, Xiong Z, Dutta R, Schaeffer R et al (2023) Decodingtrust: a comprehensive assessment of trustworthiness in gpt models. In: Proceedings of the 37th international conference on neural information processing systems, pp 31232\u201331339. https:\/\/doi.org\/10.48550\/arXiv.2306.11698","DOI":"10.48550\/arXiv.2306.11698"},{"key":"7102_CR55","doi-asserted-by":"publisher","unstructured":"Dinan E, Abercrombie G, Bergman AS, Spruit S, Hovy D, Boureau Y-L, Rieser V (2021) Anticipating safety issues in e2e conversational ai: framework and tooling. arXiv:2107.03451. https:\/\/doi.org\/10.48550\/arXiv.2107.03451","DOI":"10.48550\/arXiv.2107.03451"},{"key":"7102_CR56","doi-asserted-by":"publisher","unstructured":"Chen C, Shen J, Deng Z, Lei L (2025) Conformal tail risk control for large language model alignment. In: Forty-second international conference on machine learning, pp 1\u201324. https:\/\/doi.org\/10.48550\/arXiv.2502.20285","DOI":"10.48550\/arXiv.2502.20285"},{"issue":"12","key":"7102_CR57","doi-asserted-by":"publisher","first-page":"855","DOI":"10.1007\/s10489-025-06738-1","volume":"55","author":"S Ahmad","year":"2025","unstructured":"Ahmad S, Wang H (2025) Transformer-variational autoencoder for estimating individual treatment effect using causal inference framework. Appl Intell 55(12):855. https:\/\/doi.org\/10.1007\/s10489-025-06738-1","journal-title":"Appl Intell"},{"key":"7102_CR58","doi-asserted-by":"publisher","unstructured":"Jain S, Wallace BC (2019) Attention is not explanation. In: Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, Volume 1 (Long and Short Papers), pp 3543\u20133556. 
https:\/\/doi.org\/10.18653\/v1\/N19-1357","DOI":"10.18653\/v1\/N19-1357"},{"key":"7102_CR59","doi-asserted-by":"publisher","unstructured":"Wiegreffe S, Pinter Y (2019) Attention is not not explanation. In: Proceedings of the 2019 conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp 11\u201320. https:\/\/doi.org\/10.18653\/v1\/D19-1002","DOI":"10.18653\/v1\/D19-1002"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-026-07102-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-026-07102-7","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-026-07102-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T08:28:13Z","timestamp":1775809693000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-026-07102-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":59,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["7102"],"URL":"https:\/\/doi.org\/10.1007\/s10489-026-07102-7","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"13 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 January 
2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 April 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"196"}}