{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T23:25:25Z","timestamp":1757546725625,"version":"3.40.3"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031789762"},{"type":"electronic","value":"9783031789779"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78977-9_3","type":"book-chapter","created":{"date-parts":[[2025,1,27]],"date-time":"2025-01-27T10:13:32Z","timestamp":1737972812000},"page":"36-51","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Evaluating the\u00a0Reliability of\u00a0Self-explanations in\u00a0Large Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7938-2747","authenticated-orcid":false,"given":"Korbinian","family":"Randl","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9188-7425","authenticated-orcid":false,"given":"John","family":"Pavlopoulos","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9731-1048","authenticated-orcid":false,"given":"Aron","family":"Henriksson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7713-1381","authenticated-orcid":false,"given":"Tony","family":"Lindgren","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,28]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Abnar, S., Zuidema, W.: Quantifying attention flow in transformers. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J. (eds.) Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 4190\u20134197 (2020)","DOI":"10.18653\/v1\/2020.acl-main.385"},{"issue":"1","key":"3_CR2","first-page":"147","volume":"9","author":"DH Ackley","year":"1985","unstructured":"Ackley, D.H., Hinton, G.E., Sejnowski, T.J.: A learning algorithm for boltzmann machines. Cogn. Sci. 9(1), 147\u2013169 (1985)","journal-title":"Cogn. Sci."},{"key":"3_CR3","doi-asserted-by":"publisher","unstructured":"Almazrouei, E., Alobeidli, H., Alshamsi, A., et\u00a0al.: The falcon series of open language models (2023). https:\/\/doi.org\/10.48550\/arXiv.2311.16867","DOI":"10.48550\/arXiv.2311.16867"},{"issue":"7","key":"3_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1371\/journal.pone.0130140","volume":"10","author":"S Bach","year":"2015","unstructured":"Bach, S., Binder, A., Montavon, G., Klauschen, F., M\u00fcller, K.R., Samek, W.: On pixel-wise explanations for non-linear classifier decisions by layer-wise relevance propagation. PLoS ONE 10(7), 1\u201346 (2015)","journal-title":"PLoS ONE"},{"key":"3_CR5","unstructured":"Brown, T., Mann, B., Ryder, N., et al.: Language models are few-shot learners. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Advances in Neural Information Processing Systems, vol.\u00a033, pp. 1877\u20131901 (2020)"},{"key":"3_CR6","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Transformer interpretability beyond attention visualization. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 782\u2013791 (2021)","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"3_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Burstein, J., Doran, C., Solorio, T. (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186 (2019)"},{"key":"3_CR8","doi-asserted-by":"publisher","unstructured":"Dubey, A., Jauhri, A., Pandey, A., et\u00a0al.: The llama 3 herd of models (2024). https:\/\/doi.org\/10.48550\/arXiv.2407.21783","DOI":"10.48550\/arXiv.2407.21783"},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Gurrapu, S., Kulkarni, A., Huang, L., Lourentzou, I., Batarseh, F.A.: Rationalization for explainable NLP: a survey. Front. Artif. Intell. 6 (2023)","DOI":"10.3389\/frai.2023.1225093"},{"key":"3_CR10","doi-asserted-by":"publisher","unstructured":"Hechtlinger, Y.: Interpretation of prediction models using the input gradient (2016). https:\/\/doi.org\/10.48550\/arXiv.1611.07634","DOI":"10.48550\/arXiv.1611.07634"},{"key":"3_CR11","doi-asserted-by":"publisher","unstructured":"Holtzman, A., Buys, J., Du, L., Forbes, M., Choi, Y.: The curious case of neural text degeneration (2020). https:\/\/doi.org\/10.48550\/arXiv.1904.09751","DOI":"10.48550\/arXiv.1904.09751"},{"key":"3_CR12","doi-asserted-by":"publisher","unstructured":"Huang, S., Mamidanna, S., Jangam, S., Zhou, Y., Gilpin, L.H.: Can large language models explain themselves? A study of LLM-generated self-explanations (2023). https:\/\/doi.org\/10.48550\/arXiv.2310.11207","DOI":"10.48550\/arXiv.2310.11207"},{"key":"3_CR13","doi-asserted-by":"publisher","unstructured":"Jiang, A.Q., Sablayrolles, A., Mensch, A., et\u00a0al.: Mistral 7b (2023). https:\/\/doi.org\/10.48550\/arXiv.2310.06825","DOI":"10.48550\/arXiv.2310.06825"},{"key":"3_CR14","doi-asserted-by":"crossref","unstructured":"Lewis, M., et al.: BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J. (eds.) Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 7871\u20137880 (2020)","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"3_CR15","doi-asserted-by":"publisher","unstructured":"Li, J., Monroe, W., Jurafsky, D.: Understanding neural networks through representation erasure (2017). https:\/\/doi.org\/10.48550\/arXiv.1612.08220","DOI":"10.48550\/arXiv.1612.08220"},{"key":"3_CR16","unstructured":"Li, Z., Xu, P., Liu, F., Song, H.: Towards understanding in-context learning with contrastive demonstrations and saliency maps. CoRR abs\/2307.05052 (2023)"},{"key":"3_CR17","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Liu, S., Le, F., Chakraborty, S., Abdelzaher, T.: On exploring attention-based explanation for transformer models in text classification. In: 2021 IEEE International Conference on Big Data (Big Data), pp. 1193\u20131203 (2021)","DOI":"10.1109\/BigData52589.2021.9671639"},{"key":"3_CR19","doi-asserted-by":"publisher","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized bert pretraining approach (2019). https:\/\/doi.org\/10.48550\/arXiv.1907.11692","DOI":"10.48550\/arXiv.1907.11692"},{"key":"3_CR20","unstructured":"Lundberg, S.M., Lee, S.I.: A unified approach to interpreting model predictions. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems 30, pp. 4765\u20134774. Curran Associates, Inc. (2017)"},{"key":"3_CR21","doi-asserted-by":"crossref","unstructured":"Madsen, A., Chandar, S., Reddy, S.: Are self-explanations from large language models faithful? In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Findings of the Association for Computational Linguistics ACL 2024, pp. 295\u2013337 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.19"},{"key":"3_CR22","doi-asserted-by":"publisher","unstructured":"Mesnard, T., Hardin, C., Dadashi, R., et\u00a0al.: Gemma: open models based on gemini research and technology (2024). https:\/\/doi.org\/10.48550\/arXiv.2403.08295","DOI":"10.48550\/arXiv.2403.08295"},{"key":"3_CR23","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1016\/j.patcog.2016.11.008","volume":"65","author":"G Montavon","year":"2017","unstructured":"Montavon, G., Lapuschkin, S., Binder, A., Samek, W., M\u00fcller, K.R.: Explaining nonlinear classification decisions with deep taylor decomposition. Pattern Recognit. 65, 211\u2013222 (2017)","journal-title":"Pattern Recognit."},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Pang, B., Lee, L.: Seeing stars: exploiting class relationships for sentiment categorization with respect to rating scales. In: Knight, K., Ng, H.T., Oflazer, K. (eds.) Proceedings of the 43rd Annual Meeting of the Association for Computational Linguistics (ACL 2005), pp. 115\u2013124 (2005)","DOI":"10.3115\/1219840.1219855"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Isabelle, P., Charniak, E., Lin, D. (eds.) Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"3_CR26","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1) (2020)"},{"key":"3_CR27","doi-asserted-by":"publisher","unstructured":"Randl, K., Karvounis, M., Marinos, G., Pavlopoulos, J., Lindgren, T., Henriksson, A.: Food recall incidents (2024). https:\/\/doi.org\/10.5281\/zenodo.10820657","DOI":"10.5281\/zenodo.10820657"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Ribeiro, M.T., Singh, S., Guestrin, C.: \u201cWhy should i trust you?\u201d: explaining the predictions of any classifier. In: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD 2016, pp. 1135\u20131144 (2016)","DOI":"10.1145\/2939672.2939778"},{"key":"3_CR29","unstructured":"Shrikumar, A., Greenside, P., Kundaje, A.: Learning important features through propagating activation differences. In: Proceedings of the 34th International Conference on Machine Learning - Volume 70, ICML 2017, pp. 3145\u20133153 (2017)"},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"Socher, R., et al.: Recursive deep models for semantic compositionality over a sentiment treebank. In: Yarowsky, D., Baldwin, T., Korhonen, A., Livescu, K., Bethard, S. (eds.) Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing, pp. 1631\u20131642. Association for Computational Linguistics, Seattle, Washington, USA (2013)","DOI":"10.18653\/v1\/D13-1170"},{"key":"3_CR31","unstructured":"Sundararajan, M., Taly, A., Yan, Q.: Axiomatic attribution for deep networks. In: Proceedings of the 34th International Conference on Machine Learning - Volume 70, ICML 2017, pp. 3319\u20133328 (2017)"},{"key":"3_CR32","doi-asserted-by":"publisher","unstructured":"Touvron, H., Martin, L., Stone, K., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models (2023). https:\/\/doi.org\/10.48550\/arXiv.2307.09288","DOI":"10.48550\/arXiv.2307.09288"},{"key":"3_CR33","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, NIPS 2017, pp. 6000\u20136010 (2017)"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"Verma, S., Boonsanong, V., Hoang, M., Hines, K., Dickerson, J., Shah, C.: Counterfactual explanations and algorithmic recourses for machine learning: a review. ACM Comput. Surv. (2024)","DOI":"10.1145\/3677119"},{"key":"3_CR35","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, 28 November\u20139 December 2022 (2022)"},{"key":"3_CR36","unstructured":"Yuan, W., Neubig, G., Liu, P.: Bartscore: evaluating generated text as text generation. In: Ranzato, M., Beygelzimer, A., Dauphin, Y., Liang, P., Vaughan, J.W. (eds.) Advances in Neural Information Processing Systems, vol.\u00a034, pp. 27263\u201327277 (2021)"},{"key":"3_CR37","doi-asserted-by":"crossref","unstructured":"Zaidan, O., Eisner, J.: Modeling annotators: a generative approach to learning from annotator rationales. In: Lapata, M., Ng, H.T. (eds.) Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing, pp. 31\u201340 (2008)","DOI":"10.3115\/1613715.1613721"},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Zhao, H., et al.: Explainability for large language models: a survey. ACM Trans. Intell. Syst. Technol. 15(2) (2024)","DOI":"10.1145\/3639372"}],"container-title":["Lecture Notes in Computer Science","Discovery Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78977-9_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,27]],"date-time":"2025-01-27T10:13:49Z","timestamp":1737972829000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78977-9_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031789762","9783031789779"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78977-9_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"28 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"DS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Discovery Science","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pisa","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dis2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/ds2024.isti.cnr.it\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}