{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:37:45Z","timestamp":1767339465309,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":47,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031546044"},{"type":"electronic","value":"9783031546051"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-54605-1_24","type":"book-chapter","created":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T11:43:10Z","timestamp":1709811790000},"page":"363-377","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["DeViL: Decoding Vision features into\u00a0Language"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-1249-3630","authenticated-orcid":false,"given":"Meghal","family":"Dani","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2302-8597","authenticated-orcid":false,"given":"Isabel","family":"Rio-Torto","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3541-2163","authenticated-orcid":false,"given":"Stephan","family":"Alaniz","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1432-7747","authenticated-orcid":false,"given":"Zeynep","family":"Akata","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,3,8]]},"reference":[{"key":"24_CR1","unstructured":"Adebayo, J., Gilmer, J., Muelly, M., Goodfellow, I.J., Hardt, M., Kim, B.: Sanity checks for saliency maps. In: NeurIPS (2018)"},{"key":"24_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal, P., Carreira, J., Malik, J.: Learning to see by moving. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.13"},{"key":"24_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1007\/978-3-319-46454-1_24","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Anderson","year":"2016","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 382\u2013398. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24"},{"key":"24_CR4","doi-asserted-by":"crossref","unstructured":"Bau, D., Zhou, B., Khosla, A., Oliva, A., Torralba, A.: Network dissection: Quantifying interpretability of deep visual representations. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.354"},{"key":"24_CR5","doi-asserted-by":"crossref","unstructured":"Bohle, M., Fritz, M., Schiele, B.: Convolutional dynamic alignment networks for interpretable classifications. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00990"},{"key":"24_CR6","doi-asserted-by":"crossref","unstructured":"B\u00f6hle, M., Fritz, M., Schiele, B.: B-cos networks: alignment is all we need for interpretability. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01008"},{"key":"24_CR7","unstructured":"Brendel, W., Bethge, M.: Approximating CNNs with bag-of-local-features models works surprisingly well on imageNet. In: ICLR (2019)"},{"key":"24_CR8","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12m: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"24_CR9","doi-asserted-by":"crossref","unstructured":"Chattopadhay, A., Sarkar, A., Howlader, P., Balasubramanian, V.N.: Grad-cam++: generalized gradient-based visual explanations for deep convolutional networks. In: WACV (2018)","DOI":"10.1109\/WACV.2018.00097"},{"key":"24_CR10","unstructured":"Chen, C., Li, O., Tao, D., Barnett, A., Rudin, C., Su, J.K.: This looks like that: deep learning for interpretable image recognition. In: NeurIPS (2019)"},{"key":"24_CR11","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"24_CR12","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor universal: Language specific translation evaluation for any target language. In: WMT (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"24_CR13","doi-asserted-by":"publisher","first-page":"e30","DOI":"10.23915\/distill.00030","volume":"6","author":"G Goh","year":"2021","unstructured":"Goh, G., et al.: Multimodal neurons in artificial neural networks. Distill 6, e30 (2021)","journal-title":"Distill"},{"key":"24_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"24_CR15","unstructured":"Hernandez, E., Schwettmann, S., Bau, D., Bagashvili, T., Torralba, A., Andreas, J.: Natural language descriptions of deep visual features. In: ICLR (2022)"},{"key":"24_CR16","doi-asserted-by":"crossref","unstructured":"Hu, X., et al.: Scaling up vision-language pre-training for image captioning. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01745"},{"key":"24_CR17","doi-asserted-by":"crossref","unstructured":"Kayser, M., et al.: e-vil: a dataset and benchmark for natural language explanations in vision-language tasks. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00128"},{"key":"24_CR18","unstructured":"Kim, B., et al.: Interpretability beyond feature attribution: quantitative testing with concept activation vectors (TCAV). In: ICML (2018)"},{"key":"24_CR19","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: NeurIPS (2012)"},{"key":"24_CR20","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: WAS (2004)"},{"key":"24_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"24_CR22","doi-asserted-by":"crossref","unstructured":"Liu, W., et al.: Decoupled networks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00293"},{"key":"24_CR23","unstructured":"Lundberg, S.M., Lee, S.: A unified approach to interpreting model predictions. In: Guyon, I., von Luxburg, U., Bengio, S., Wallach, H.M., Fergus, R., Vishwanathan, S.V.N., Garnett, R. (eds.) NeurIPS (2017)"},{"key":"24_CR24","unstructured":"Majumder, B.P., Camburu, O., Lukasiewicz, T., Mcauley, J.: Knowledge-grounded self-rationalization via extractive and natural language explanations. In: ICML (2022)"},{"key":"24_CR25","doi-asserted-by":"crossref","unstructured":"Marasovi\u0107, A., Bhagavatula, C., sung Park, J., Le Bras, R., Smith, N.A., Choi, Y.: Natural language rationales with full-stack visual reasoning: from pixels to semantic frames to commonsense graphs. In: EMNLP (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.253"},{"key":"24_CR26","unstructured":"Mokady, R., Hertz, A., Bermano, A.H.: Clipcap: clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)"},{"key":"24_CR27","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"24_CR28","unstructured":"Petsiuk, V., Das, A., Saenko, K.: RISE: randomized input sampling for explanation of black-box models. In: BMVC (2018)"},{"key":"24_CR29","unstructured":"Pl\u00fcster, B., Ambsdorf, J., Braach, L., Lee, J.H., Wermter, S.: Harnessing the power of multi-task pretraining for ground-truth level natural language explanations. arXiv preprint arXiv:2212.04231 (2022)"},{"key":"24_CR30","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"24_CR31","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners (2019)"},{"key":"24_CR32","doi-asserted-by":"crossref","unstructured":"Sammani, F., Mukherjee, T., Deligiannis, N.: NLX-GPT: a model for natural language explanations in vision and vision-language tasks. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00814"},{"key":"24_CR33","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: visual explanations from deep networks via gradient-based localization. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"24_CR34","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"24_CR35","unstructured":"Smilkov, D., Thorat, N., Kim, B., Vi\u00e9gas, F., Wattenberg, M.: Smoothgrad: removing noise by adding noise (2017)"},{"key":"24_CR36","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G.E., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. JMLR 15, 1929\u20131958 (2014)","journal-title":"JMLR"},{"key":"24_CR37","unstructured":"Sundararajan, M., Taly, A., Yan, Q.: Axiomatic attribution for deep networks. In: ICML (2017)"},{"key":"24_CR38","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et al.: Going deeper with convolutions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"24_CR39","doi-asserted-by":"crossref","unstructured":"Tewel, Y., Shalev, Y., Schwartz, I., Wolf, L.: Zerocap: zero-shot image-to-text generation for visual-semantic arithmetic. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"24_CR40","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"24_CR41","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"818","DOI":"10.1007\/978-3-319-10590-1_53","volume-title":"Computer Vision \u2013 ECCV 2014","author":"MD Zeiler","year":"2014","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional networks. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8689, pp. 818\u2013833. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10590-1_53"},{"key":"24_CR42","unstructured":"Zhang, S., et al.: Opt: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"24_CR43","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: Bertscore: evaluating text generation with Bert. In: ICLR (2020)"},{"key":"24_CR44","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"24_CR45","doi-asserted-by":"crossref","unstructured":"Zhou, B., Lapedriza, A., Khosla, A., Oliva, A., Torralba, A.: Places: a 10 million image database for scene recognition. IEEE T-PAMI (2017)","DOI":"10.1167\/17.10.296"},{"key":"24_CR46","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J., Gao, J.: Unified vision-language pre-training for image captioning and vqa. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"24_CR47","doi-asserted-by":"crossref","unstructured":"Zoumpourlis, G., Doumanoglou, A., Vretos, N., Daras, P.: Non-linear convolution filters for CNN-based learning. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.510"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-54605-1_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T12:08:32Z","timestamp":1709813312000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-54605-1_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031546044","9783031546051"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-54605-1_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"8 March 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DAGM GCPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"DAGM German Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Heidelberg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 September 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"45","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dagm2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.dagm-gcpr.de\/year\/2023","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"76","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"40","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"53% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}