{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T19:20:05Z","timestamp":1743016805895,"version":"3.40.3"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783031048807"},{"type":"electronic","value":"9783031048814"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-04881-4_5","type":"book-chapter","created":{"date-parts":[[2022,4,25]],"date-time":"2022-04-25T19:02:54Z","timestamp":1650913374000},"page":"54-65","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["From Captions to\u00a0Explanations: A Multimodal Transformer-based Architecture for\u00a0Natural Language Explanation Generation"],"prefix":"10.1007","author":[{"given":"Isabel","family":"Rio-Torto","sequence":"first","affiliation":[]},{"given":"Jaime S.","family":"Cardoso","sequence":"additional","affiliation":[]},{"given":"Lu\u00eds F.","family":"Teixeira","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,4,26]]},"reference":[{"key":"5_CR1","unstructured":"Adebayo, J., Gilmer, J., Muelly, M., Goodfellow, I., Hardt, M., Kim, B.: Sanity checks for saliency maps. In: Proceedings of the 32nd International Conference on Neural Information Processing Systems. NIPS 2018, Red Hook, NY, USA, pp. 9525\u20139536. Curran Associates Inc. (2018)"},{"key":"5_CR2","unstructured":"Alber, M., et al.: innvestigate neural networks! J. Mach. Learn. Res. 20(93), 1\u20138 (2019). http:\/\/jmlr.org\/papers\/v20\/18-540.html"},{"key":"5_CR3","doi-asserted-by":"publisher","unstructured":"Bowman, S.R., Angeli, G., Potts, C., Manning, C.D.: A large annotated corpus for learning natural language inference. In: Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, Lisbon, Portugal, pp. 632\u2013642. Association for Computational Linguistics, September 2015. https:\/\/doi.org\/10.18653\/v1\/D15-1075, https:\/\/aclanthology.org\/D15-1075","DOI":"10.18653\/v1\/D15-1075"},{"key":"5_CR4","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL HLT 2019\u20132019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies - Proceedings of the Conference, vol. 1, pp. 4171\u20134186 (2019)"},{"key":"5_CR5","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"5_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-319-46493-0_1","volume-title":"Computer Vision \u2013 ECCV 2016","author":"LA Hendricks","year":"2016","unstructured":"Hendricks, L.A., Akata, Z., Rohrbach, M., Donahue, J., Schiele, B., Darrell, T.: Generating visual explanations. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 3\u201319. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_1"},{"key":"5_CR7","unstructured":"Hooker, S., Erhan, D., Kindermans, P.J., Kim, B.: A benchmark for interpretability methods in deep neural networks. In: Advances in Neural Information Processing Systems 32(NeurIPS) (2019)"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Kaur, H., Nori, H., Jenkins, S., Caruana, R., Wallach, H., Wortman Vaughan, J.: Interpreting interpretability: understanding data scientists\u2019 use of interpretability tools for machine learning. In: Proceedings of the 2020 CHI Conference on Human Factors in Computing Systems, pp. 1\u201314 (2020)","DOI":"10.1145\/3313831.3376219"},{"key":"5_CR9","doi-asserted-by":"crossref","unstructured":"Kayser, M., et al.: e-ViL: a dataset and benchmark for natural language explanations in vision-language tasks (2021). http:\/\/arxiv.org\/abs\/2105.03761","DOI":"10.1109\/ICCV48922.2021.00128"},{"key":"5_CR10","unstructured":"Kokhlikyan, N., et al.: Captum: a unified and generic model interpretability library for pytorch. arXiv preprint arXiv:2009.07896 (2020)"},{"key":"5_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"5_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"5_CR13","unstructured":"Liu, W., Chen, S., Guo, L., Zhu, X., Liu, J.: CPTR: full transformer network for image captioning. arXiv preprint arXiv:2101.10804 (2021)"},{"key":"5_CR14","doi-asserted-by":"publisher","unstructured":"Marasovi\u0107, A., Bhagavatula, C., Park, J.S., Le Bras, R., Smith, N.A., Choi, Y.: Natural language rationales with full-stack visual reasoning: from pixels to semantic frames to commonsense graphs. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 2810\u20132829. Association for Computational Linguistics, November 2020. https:\/\/doi.org\/10.18653\/v1\/2020.findings-emnlp.253, https:\/\/aclanthology.org\/2020.findings-emnlp.253","DOI":"10.18653\/v1\/2020.findings-emnlp.253"},{"key":"5_CR15","doi-asserted-by":"publisher","unstructured":"Park, D.H., et al.: Multimodal explanations: justifying decisions and pointing to the evidence. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 8779\u20138788. IEEE, June 2018. https:\/\/doi.org\/10.1109\/CVPR.2018.00915, https:\/\/ieeexplore.ieee.org\/document\/8579013\/","DOI":"10.1109\/CVPR.2018.00915"},{"key":"5_CR16","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language Models are Unsupervised Multitask Learners, July 2019"},{"key":"5_CR17","doi-asserted-by":"publisher","unstructured":"Rio-Torto, I., Fernandes, K., Teixeira, L.F.: Understanding the decisions of CNNs: an in-model approach. Pattern Recogn. Lett. 133(C), 373\u2013380 (2020). https:\/\/doi.org\/10.1016\/j.patrec.2020.04.004, http:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167865520301240","DOI":"10.1016\/j.patrec.2020.04.004"},{"key":"5_CR18","doi-asserted-by":"publisher","unstructured":"Rudin, C.: Stop explaining black box machine learning models for high stakes decisions and use interpretable models instead. Nat. Mach. Intell. 1(5), 206\u2013215 (2019). https:\/\/doi.org\/10.1038\/s42256-019-0048-x, http:\/\/www.nature.com\/articles\/s42256-019-0048-x","DOI":"10.1038\/s42256-019-0048-x"},{"key":"5_CR19","doi-asserted-by":"crossref","unstructured":"Rudin, C., Chen, C., Chen, Z., Huang, H., Semenova, L., Zhong, C.: Interpretable machine learning: fundamental principles and 10 grand challenges, pp. 1\u201380, March 2021. http:\/\/arxiv.org\/abs\/2103.11251","DOI":"10.1214\/21-SS133"},{"issue":"11","key":"5_CR20","doi-asserted-by":"publisher","first-page":"2660","DOI":"10.1109\/TNNLS.2016.2599820","volume":"28","author":"W Samek","year":"2017","unstructured":"Samek, W., Binder, A., Montavon, G., Lapuschkin, S., M\u00fcller, K.R.: Evaluating the visualization of what a deep neural network has learned. IEEE Trans. Neural Networks Learn. Syst. 28(11), 2660\u20132673 (2017). https:\/\/doi.org\/10.1109\/TNNLS.2016.2599820","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"key":"5_CR21","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5999\u20136009 (2017)"},{"key":"5_CR22","doi-asserted-by":"publisher","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3156\u20133164. IEEE, June 2015. https:\/\/doi.org\/10.1109\/CVPR.2015.7298935, http:\/\/ieeexplore.ieee.org\/document\/7298935\/","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Wu, J., Mooney, R.: Faithful multimodal explanation for visual question answering. In: Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP, pp. 103\u2013112 (2019)","DOI":"10.18653\/v1\/W19-4812"},{"key":"5_CR24","unstructured":"Xie, N., Lai, F., Doran, D., Kadav, A.: Visual entailment: a novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706 (2019)"},{"key":"5_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: Vinvl: making visual representations matter in vision-language models. In: CVPR 2021 (2021)","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"5_CR26","unstructured":"Zhang, Y., Jiang, H., Miura, Y., Manning, C.D., Langlotz, C.P.: Contrastive learning of medical visual representations from paired images and text. arXiv preprint arXiv:2010.00747 (2020)"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J., Gao, J.: Unified vision-language pre-training for image captioning and VQA. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 13041\u201313049 (2020)","DOI":"10.1609\/aaai.v34i07.7005"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Image Analysis"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-04881-4_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T16:01:11Z","timestamp":1709827271000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-04881-4_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031048807","9783031048814"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-04881-4_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"26 April 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"IbPRIA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Iberian Conference on Pattern Recognition and Image Analysis","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Aveiro","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Portugal","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 May 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 May 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ibpria2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.ibpria.org\/2022\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"72","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"54","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"75% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}