{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T02:18:10Z","timestamp":1774059490057,"version":"3.50.1"},"publisher-location":"Cham","reference-count":70,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730382","type":"print"},{"value":"9783031730399","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73039-9_11","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"183-201","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Evolving Interpretable Visual Classifiers with\u00a0Large Language Models"],"prefix":"10.1007","author":[{"given":"Mia","family":"Chiquier","sequence":"first","affiliation":[]},{"given":"Utkarsh","family":"Mall","sequence":"additional","affiliation":[]},{"given":"Carl","family":"Vondrick","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"11_CR1","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp.\u00a08748\u20138763. PMLR (2021)"},{"key":"11_CR2","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR3","unstructured":"Ferrari, V., Zisserman, A.: Learning visual attributes. In: Advances in Neural Information Processing Systems, vol.\u00a020 (2007)"},{"key":"11_CR4","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Endres, I., Hoiem, D., Forsyth, D.: Describing objects by their attributes. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp.\u00a01778\u20131785. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206772"},{"key":"11_CR5","unstructured":"Menon, S., Vondrick, C.: Visual classification via description from large language models. In: ICLR (2023)"},{"key":"11_CR6","unstructured":"Koh, P.W., et al.: Concept bottleneck models. In: ICML (2020)"},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"Roth, K., Kim, J.M., Koepke, A., Vinyals, O., Schmid, C., Akata, Z.: Waffling around for performance: visual classification with random words and broad concepts. CoRR (2023)","DOI":"10.1109\/ICCV51070.2023.01443"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Holland, J.H.: Adaptation in Natural and Artificial Systems: An Introductory Analysis with Applications to Biology, Control, and Artificial Intelligence. 
MIT Press, Cambridge (1992)","DOI":"10.7551\/mitpress\/1090.001.0001"},{"key":"11_CR9","doi-asserted-by":"crossref","unstructured":"Van\u00a0Horn, G., Cole, E., Beery, S., Wilber, K., Belongie, S., Mac\u00a0Aodha, O.: Benchmarking representation learning for natural world image collections. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.\u00a012884\u201312893 (2021)","DOI":"10.1109\/CVPR46437.2021.01269"},{"key":"11_CR10","unstructured":"Alper, M., Averbuch-Elor, H.: Kiki or Bouba? Sound symbolism in vision-and-language models. In: Advances in Neural Information Processing Systems, vol.\u00a036 (2024)"},{"key":"11_CR11","doi-asserted-by":"crossref","unstructured":"Huang, S., Xu, Z., Tao, D., Zhang, Y.: Part-stacked CNN for fine-grained visual categorization. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.132"},{"key":"11_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"122","DOI":"10.1007\/978-3-030-01237-3_8","volume-title":"Computer Vision \u2013 ECCV 2018","author":"B Zhou","year":"2018","unstructured":"Zhou, B., Sun, Y., Bau, D., Torralba, A.: Interpretable basis decomposition for visual explanation. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11212, pp. 122\u2013138. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01237-3_8"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Tang, L., Wertheimer, D., Hariharan, B.: Revisiting pose-normalization for fine-grained few-shot recognition. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01436"},{"key":"11_CR14","doi-asserted-by":"crossref","unstructured":"Lampert, C.H., Nickisch, H., Harmeling, S.: Attribute-based classification for zero-shot visual object categorization (2013)","DOI":"10.1109\/TPAMI.2013.140"},{"key":"11_CR15","unstructured":"Frome, A., et al.: DeViSE: a deep visual-semantic embedding model. In: NeurIPS (2013)"},{"key":"11_CR16","doi-asserted-by":"crossref","unstructured":"Akata, Z., Reed, S., Walter, D., Lee, H., Schiele, B.: Evaluation of output embeddings for fine-grained image classification. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298911"},{"key":"11_CR17","unstructured":"Romera-Paredes, B., Torr, P.: An embarrassingly simple approach to zero-shot learning. In: ICML (2015)"},{"key":"11_CR18","doi-asserted-by":"crossref","unstructured":"Kodirov, E., Xiang, T., Gong, S.: Semantic autoencoder for zero-shot learning. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.473"},{"key":"11_CR19","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-CAM: visual explanations from deep networks via gradient-based localization. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"11_CR20","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. In: ICCV, pp.\u00a0397\u2013406 (2021)","DOI":"10.1109\/ICCV48922.2021.00045"},{"key":"11_CR21","doi-asserted-by":"crossref","unstructured":"Fong, R., Patrick, M., Vedaldi, A.: Understanding deep networks via extremal perturbations and smooth masks. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00304"},{"key":"11_CR22","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. 
In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"11_CR23","unstructured":"Petsiuk, V., Das, A., Saenko, K.: Rise: randomized input sampling for explanation of black-box models. CoRR (2018)"},{"key":"11_CR24","unstructured":"Shitole, V., Li, F., Kahng, M., Tadepalli, P., Fern, A.: One explanation is not enough: structured attention graphs for image classification. In: NeurIPS (2021)"},{"key":"11_CR25","unstructured":"Simonyan, K., Vedaldi, A., Zisserman, A.: Deep inside convolutional networks: visualising image classification models and saliency maps. CoRR (2013)"},{"key":"11_CR26","unstructured":"Goyal, Y., Wu, Z., Ernst, J., Batra, D., Parikh, D., Lee, S.: Counterfactual visual explanations. In: ICML (2019)"},{"key":"11_CR27","unstructured":"Prabhu, V., Yenamandra, S., Chattopadhyay, P., Hoffman, J.: LANCE: stress-testing visual models by generating language-guided counterfactual images. In: NeurIPS (2024)"},{"key":"11_CR28","doi-asserted-by":"publisher","unstructured":"Vandenhende, S., Mahajan, D., Radenovic, F., Ghadiyaram, D.: Making heads or tails: towards semantically consistent visual counterfactuals. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13672, pp. 261\u2013279. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19775-8_16","DOI":"10.1007\/978-3-031-19775-8_16"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Wang, P., Vasconcelos, N.: Scout: self-aware discriminant counterfactual explanations. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00900"},{"key":"11_CR30","unstructured":"Yeh, C.-K., Kim, J., Yen, I.E.-H., Ravikumar, P.K.: Representer point selection for explaining deep neural networks. In: NeurIPS (2018)"},{"key":"11_CR31","unstructured":"Tsai, C.-P., Yeh, C.-K., Ravikumar, P.: Sample based explanations via generalized representers. CoRR (2023)"},{"key":"11_CR32","unstructured":"Sui, Y., Wu, G., Sanner, S.: Representer point selection via local Jacobian expansion for post-hoc classifier explanation of deep neural networks and ensemble models. In: NeurIPS (2021)"},{"key":"11_CR33","unstructured":"Pruthi, G., Liu, F., Sundararajan, M., Kale, S.: Estimating training data influence by tracking gradient descent. CoRR (2020)"},{"key":"11_CR34","unstructured":"Koh, P.W., Liang, P.: Understanding black-box predictions via influence functions. In: ICML (2017)"},{"key":"11_CR35","unstructured":"Silva, A., Chopra, R., Gombolay, M.C.: Cross-loss influence functions to explain deep network representations. In: AISTATS (2020)"},{"key":"11_CR36","doi-asserted-by":"crossref","unstructured":"Guo, H., Rajani, N., Hase, P., Bansal, M., Xiong, C.: FastIF: scalable influence functions for efficient model interpretation and debugging. CoRR (2020)","DOI":"10.18653\/v1\/2021.emnlp-main.808"},{"key":"11_CR37","unstructured":"Gandelsman, Y., Efros, A.A., Steinhardt, J.: Interpreting clip\u2019s image representation via text-based decomposition (2023)"},{"key":"11_CR38","doi-asserted-by":"crossref","unstructured":"Dravid, A., Gandelsman, Y., Efros, A.A., Shocher, A.: Rosetta neurons: mining the common units in a model zoo. 
In: CVPR (2023)","DOI":"10.1109\/ICCV51070.2023.00185"},{"key":"11_CR39","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"818","DOI":"10.1007\/978-3-319-10590-1_53","volume-title":"Computer Vision \u2013 ECCV 2014","author":"MD Zeiler","year":"2014","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional networks. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8689, pp. 818\u2013833. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10590-1_53"},{"key":"11_CR40","unstructured":"Frankle, J., Carbin, M.: The lottery ticket hypothesis: training pruned neural networks. CoRR (2018)"},{"key":"11_CR41","unstructured":"Nguyen, A.M., Dosovitskiy, A., Yosinski, J., Brox, T., Clune, J.: Synthesizing the preferred inputs for neurons in neural networks via deep generator networks. In: NeurIPS (2016)"},{"key":"11_CR42","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp.\u00a04904\u20134916. PMLR (2021)"},{"key":"11_CR43","doi-asserted-by":"crossref","unstructured":"Cherti, M., et al.: Reproducible scaling laws for contrastive language-image learning. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"11_CR44","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. In: NeurIPS (2021)"},{"key":"11_CR45","unstructured":"Alayrac, J.-B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"key":"11_CR46","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: SimVLM: simple visual language model pretraining with weak supervision. CoRR (2021)"},{"key":"11_CR47","doi-asserted-by":"crossref","unstructured":"Desai, K., Johnson, J.: VirTex: learning visual representations from textual annotations. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"11_CR48","doi-asserted-by":"crossref","unstructured":"Chen, J., Guo, H., Yi, K., Li, B., Elhoseiny, M.: VisualGPT: data-efficient adaptation of pretrained language models for image captioning. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01750"},{"key":"11_CR49","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.-J., Chang, K.-W.: VisualBERT: a simple and performant baseline for vision and language. CoRR (2019)"},{"key":"11_CR50","doi-asserted-by":"crossref","unstructured":"Xu, X., Wu, C., Rosenman, S., Lal, V., Che, W., Duan, N.: BridgeTower: building bridges between encoders in vision-language representation learning. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i9.26263"},{"key":"11_CR51","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. CoRR (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"11_CR52","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: NeurIPS (2019)"},{"key":"11_CR53","unstructured":"Zeng, Y., Zhang, X., Li, H.: Multi-grained vision language pre-training: aligning texts with visual concepts. CoRR (2021)"},{"key":"11_CR54","unstructured":"Yuan, L., et\u00a0al.: Florence: a new foundation model for computer vision. 
CoRR (2021)"},{"key":"11_CR55","doi-asserted-by":"crossref","unstructured":"Li, L.H., et\u00a0al.: Grounded language-image pre-training. In: CVPR (2022)","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"11_CR56","unstructured":"Li, L., Dou, Z.-Y., Peng, N., Chang, K.-W.: DesCo: learning object recognition with rich language descriptions. In: NeurIPS (2024)"},{"key":"11_CR57","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML (2022)"},{"key":"11_CR58","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. CoRR (2023)"},{"key":"11_CR59","unstructured":"Mokady, R., Hertz, A., Bermano, A.H.: ClipCap: clip prefix for image captioning. CoRR (2021)"},{"key":"11_CR60","unstructured":"Luo, Z., Xi, Y., Zhang, R., Ma, J.: A frustratingly simple approach for end-to-end image captioning. CoRR (2022)"},{"key":"11_CR61","doi-asserted-by":"crossref","unstructured":"Pratt, S., Covert, I., Liu, R., Farhadi, A.: What does a platypus look like? Generating customized prompts for zero-shot image classification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp.\u00a015691\u201315701 (2023)","DOI":"10.1109\/ICCV51070.2023.01438"},{"key":"11_CR62","doi-asserted-by":"crossref","unstructured":"Yan, A., et al.: Learning concise and descriptive attributes for visual recognition. In: CVPR (2023)","DOI":"10.1109\/ICCV51070.2023.00287"},{"key":"11_CR63","doi-asserted-by":"crossref","unstructured":"Romera-Paredes, B., et\u00a0al.: Mathematical discoveries from program search with large language models. Nature (2023)","DOI":"10.1038\/s41586-023-06924-6"},{"key":"11_CR64","unstructured":"Yu, S., Liu, S., Lin, Z., Pathak, D., Ramanan, D.: Language models as black-box optimizers for vision-language models. CoRR (2023)"},{"key":"11_CR65","unstructured":"Jin, S., Jiang, X., Huang, J., Lu, L., Lu, S.: LLMs meet VLMs: boost open vocabulary object detection with fine-grained descriptors. CoRR (2024)"},{"key":"11_CR66","unstructured":"Han, S., Zhuo, L., Liao, Y., Liu, S.: LLMs as visual explainers: advancing image classification with evolving visual descriptions. CoRR (2023)"},{"key":"11_CR67","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. CoRR (2023)"},{"issue":"12","key":"11_CR68","first-page":"3","volume":"8","author":"VS Ramachandran","year":"2001","unstructured":"Ramachandran, V.S., Hubbard, E.M.: Synaesthesia-a window into perception, thought and language. J. Conscious. Stud. 8(12), 3\u201334 (2001)","journal-title":"J. Conscious. Stud."},{"key":"11_CR69","unstructured":"Mann, B., et\u00a0al.: Language models are few-shot learners. CoRR (2020)"},{"key":"11_CR70","doi-asserted-by":"crossref","unstructured":"Melville, P., Mooney, R.J.: Diverse ensembles for active learning. 
In: Proceedings of the Twenty-First International Conference on Machine Learning, p.\u00a074 (2004)","DOI":"10.1145\/1015330.1015385"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73039-9_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:20:14Z","timestamp":1730301614000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73039-9_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730382","9783031730399"],"references-count":70,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73039-9_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
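
The object above is a standard Crossref REST API work response: the https://api.crossref.org/works/{DOI} endpoint returns exactly this "status" / "message-type" / "message" envelope. The Python sketch below fetches the record by its DOI and pulls out a few of the fields shown above. It is a minimal illustration, not part of the record: it assumes network access, and the User-Agent contact address is a placeholder.

    # Minimal sketch: retrieve and parse the Crossref work record above.
    import json
    import urllib.request

    # DOI copied from the record's "DOI" field.
    DOI = "10.1007/978-3-031-73039-9_11"
    url = f"https://api.crossref.org/works/{DOI}"

    # Crossref etiquette asks clients to identify themselves; the mailto
    # address here is a placeholder, not a real contact.
    req = urllib.request.Request(
        url, headers={"User-Agent": "example-client/0.1 (mailto:you@example.org)"}
    )

    with urllib.request.urlopen(req) as resp:
        record = json.load(resp)

    # Envelope shape matches the record above.
    assert record["status"] == "ok"
    assert record["message-type"] == "work"
    work = record["message"]

    title = work["title"][0]               # "Evolving Interpretable Visual Classifiers ..."
    authors = [f'{a["given"]} {a["family"]}' for a in work["author"]]
    page = work.get("page")                # "183-201"
    n_refs = work.get("references-count")  # 70

    print(title)
    print(", ".join(authors))
    print(f"pp. {page}, {n_refs} references")

Sticking to the standard library keeps the sketch dependency-free; in practice a client such as requests, or a dedicated Crossref wrapper like habanero, would be the more ergonomic choice.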