{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:59:25Z","timestamp":1743094765454,"version":"3.40.3"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031546044"},{"type":"electronic","value":"9783031546051"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-54605-1_25","type":"book-chapter","created":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T11:43:10Z","timestamp":1709811790000},"page":"378-393","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Zero-Shot Translation of\u00a0Attention Patterns in\u00a0VQA Models to\u00a0Natural Language"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8531-3011","authenticated-orcid":false,"given":"Leonard","family":"Salewski","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5807-0576","authenticated-orcid":false,"given":"A. Sophia","family":"Koepke","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3616-8668","authenticated-orcid":false,"given":"Hendrik P. A.","family":"Lensch","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1432-7747","authenticated-orcid":false,"given":"Zeynep","family":"Akata","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,3,8]]},"reference":[{"key":"25_CR1","doi-asserted-by":"crossref","unstructured":"Abnar, S., Zuidema, W.: Quantifying attention flow in transformers. In: ACL (2020)","DOI":"10.18653\/v1\/2020.acl-main.385"},{"key":"25_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal, A., et al.: Vqa: visual question answering. Int. J. Comput. Vis. (2015)","DOI":"10.1007\/s11263-016-0966-6"},{"key":"25_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1007\/978-3-319-46454-1_24","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Anderson","year":"2016","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 382\u2013398. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24"},{"key":"25_CR4","doi-asserted-by":"publisher","first-page":"e0130140","DOI":"10.1371\/journal.pone.0130140","volume":"10","author":"S Bach","year":"2015","unstructured":"Bach, S., Binder, A., Montavon, G., Klauschen, F., M\u00fcller, K.R., Samek, W.: On pixel-wise explanations for non-linear classifier decisions by layer-wise relevance propagation. PloS one 10, e0130140 (2015)","journal-title":"PloS one"},{"key":"25_CR5","unstructured":"Banerjee, S., Lavie, A.: Meteor: An automatic metric for MT evaluation with improved correlation with human judgments. In: IEEvaluation@ACL (2005)"},{"key":"25_CR6","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"25_CR7","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00045"},{"key":"25_CR8","doi-asserted-by":"crossref","unstructured":"Chen, S., Zhao, Q.: Rex: Reasoning-aware and grounded explanation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01514"},{"key":"25_CR9","unstructured":"Draelos, R.L., Carin, L.: Use HiReSCAM instead of grad-cam for faithful explanations of convolutional neural networks (2020)"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"Fong, R., Patrick, M., Vedaldi, A.: Understanding deep networks via extremal perturbations and smooth masks. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00304"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Fong, R.C., Vedaldi, A.: Interpretable explanations of black boxes by meaningful perturbation. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.371"},{"key":"25_CR12","unstructured":"Fu, R., Hu, Q., Dong, X., Guo, Y., Gao, Y., Li, B.: Axiom-based grad-cam: towards accurate visualization and explanation of CNNs. In: BMVC (2020)"},{"key":"25_CR13","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"25_CR14","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"25_CR15","doi-asserted-by":"crossref","unstructured":"Huk Park, D., et al.: Multimodal explanations: justifying decisions and pointing to the evidence. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00915"},{"key":"25_CR16","unstructured":"Jain, S., Wallace, B.C.: Attention is not explanation. In: North American Chapter of the Association for Computational Linguistics (2019)"},{"key":"25_CR17","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., van der Maaten, L., Fei-Fei, L., Zitnick, C.L., Girshick, R.B.: Clevr: a diagnostic dataset for compositional language and elementary visual reasoning. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.215"},{"key":"25_CR18","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1007\/978-3-031-19775-8_17","volume-title":"ECCV 2022","author":"SS Kim","year":"2022","unstructured":"Kim, S.S., Meister, N., Ramaswamy, V.V., Fong, R., Russakovsky, O.: Hive: evaluating the human interpretability of visual explanations. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13672, pp. 280\u2013298. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19775-8_17"},{"key":"25_CR19","unstructured":"Kim, W., Son, B., Kim, I.: Vilt: vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning (2021)"},{"key":"25_CR20","unstructured":"Li, J., Selvaraju, R.R., Gotmare, A.D., Joty, S.R., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. In: NeurIPS (2021)"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Li, Q., Tao, Q., Joty, S., Cai, J., Luo, J.: VQA-E: explaining, elaborating, and enhancing your answers for visual questions. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01234-2_34"},{"key":"25_CR22","unstructured":"Li, W., Zhu, L., Wen, L., Yang, Y.: DeCap: decoding CLIP latents for zero-shot captioning via text-only training. In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"25_CR23","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: ACL (2004)"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Muhammad, M.B., Yeasin, M.: Eigen-cam: class activation map using principal components. In: IJCNN (2020)","DOI":"10.1109\/IJCNN48605.2020.9206626"},{"key":"25_CR25","first-page":"27730","volume":"35","author":"L Ouyang","year":"2022","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. NeurIPS 35, 27730\u201327744 (2022)","journal-title":"NeurIPS"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method and for automatic and evaluation of machine and translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"25_CR27","unstructured":"Petsiuk, V., Das, A., Saenko, K.: Rise: randomized input sampling for explanation of black-box models. In: BMVC (2018)"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Pillai, V., Pirsiavash, H.: Explainable models with consistent interpretations. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i3.16344"},{"key":"25_CR29","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"25_CR30","doi-asserted-by":"publisher","unstructured":"Salewski, L., Koepke, A.S., Lensch, H., Akata, Z.: Clevr-x: a visual reasoning dataset for natural language explanations. In: Holzinger, A., Goebel, R., Fong, R., Moon, T., M\u00fcller, KR., Samek, W. (eds.) xxAI 2020. LNCS, vol. 13200, pp. 69\u201388. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-04083-2_5","DOI":"10.1007\/978-3-031-04083-2_5"},{"key":"25_CR31","doi-asserted-by":"crossref","unstructured":"Sammani, F., Mukherjee, T., Deligiannis, N.: Nlx-gpt: a model for natural language explanations in vision and vision-language tasks. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00814"},{"key":"25_CR32","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Das, A., Vedantam, R., Cogswell, M., Parikh, D., Batra, D.: Grad-CAM: visual explanations from deep networks via gradient-based localization. Int. J. Comput. Vis. (2019)","DOI":"10.1007\/s11263-019-01228-7"},{"key":"25_CR33","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: visual explanations from deep networks via gradient-based localization. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"25_CR34","unstructured":"Simonyan, K., Vedaldi, A., Zisserman, A.: Deep inside convolutional networks: visualising image classification models and saliency maps. In: ICLR workshop (2013)"},{"key":"25_CR35","unstructured":"Smilkov, D., Thorat, N., Kim, B., Vi\u00e9gas, F., Wattenberg, M.: Smoothgrad: removing noise by adding noise. arXiv:1706.03825 (2017)"},{"key":"25_CR36","unstructured":"Springenberg, J.T., Dosovitskiy, A., Brox, T., Riedmiller, M.: Striving for simplicity: the all convolutional net. In: ICLR (Workshop Track) (2015)"},{"key":"25_CR37","unstructured":"Su, Y., et al.: Language models can see: plugging visual controls in text generation. arXiv:2205.02655 (2022)"},{"key":"25_CR38","unstructured":"Su, Y., Lan, T., Wang, Y., Yogatama, D., Kong, L., Collier, N.: A contrastive framework for neural text generation. In: Oh, A.H., Agarwal, A., Belgrave, D., Cho, K. (eds.) Advances in Neural Information Processing Systems (2022)"},{"key":"25_CR39","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: Lxmert: learning cross-modality encoder representations from transformers. arXiv:1908.07490 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"25_CR40","unstructured":"Tewel, Y., Shalev, Y., Nadler, R., Schwartz, I., Wolf, L.: Zero-shot video captioning with evolving pseudo-tokens. arXiv:2207.11100 (2022)"},{"key":"25_CR41","doi-asserted-by":"crossref","unstructured":"Tewel, Y., Shalev, Y., Schwartz, I., Wolf, L.: Zerocap: zero-shot image-to-text generation for visual-semantic arithmetic. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"25_CR42","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: Cider: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"25_CR43","doi-asserted-by":"crossref","unstructured":"Voita, E., Talbot, D., Moiseev, F., Sennrich, R., Titov, I.: Analyzing multi-head self-attention: specialized heads do the heavy lifting, the rest can be pruned. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1580"},{"key":"25_CR44","unstructured":"Wang, J., Zhang, Y., Yan, M., Zhang, J.C., Sang, J.: Zero-shot image captioning by anchor-augmented vision-language space alignment. arXiv:2211.07275 (2022)"},{"key":"25_CR45","doi-asserted-by":"crossref","unstructured":"Wiegreffe, S., Pinter, Y.: Attention is not not explanation. In: Conference on Empirical Methods in Natural Language Processing (2019)","DOI":"10.18653\/v1\/D19-1002"},{"key":"25_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"818","DOI":"10.1007\/978-3-319-10590-1_53","volume-title":"Computer Vision \u2013 ECCV 2014","author":"MD Zeiler","year":"2014","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional networks. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8689, pp. 818\u2013833. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10590-1_53"},{"key":"25_CR47","unstructured":"Zeng, A., et al.: Socratic models: composing zero-shot multimodal reasoning with language. In: ICLR (2023)"},{"key":"25_CR48","doi-asserted-by":"publisher","first-page":"1084","DOI":"10.1007\/s11263-017-1059-x","volume":"126","author":"J Zhang","year":"2018","unstructured":"Zhang, J., Bargal, S.A., Lin, Z., Brandt, J., Shen, X., Sclaroff, S.: Top-down neural attention by excitation backprop. Int. J. Comput. Vis. 126, 1084\u20131102 (2018)","journal-title":"Int. J. Comput. Vis."},{"key":"25_CR49","unstructured":"Zhang, S., et al.: Opt: open pre-trained transformer language models. arXiv:2205.01068 (2022)"},{"key":"25_CR50","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.319"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-54605-1_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T12:08:19Z","timestamp":1709813299000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-54605-1_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031546044","9783031546051"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-54605-1_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"8 March 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DAGM GCPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"DAGM German Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Heidelberg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 September 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"45","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dagm2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.dagm-gcpr.de\/year\/2023","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"76","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"40","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"53% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}