{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T05:01:01Z","timestamp":1770699661960,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":37,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819569564","type":"print"},{"value":"9789819569571","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-6957-1_26","type":"book-chapter","created":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T10:43:39Z","timestamp":1770633819000},"page":"361-374","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["On the\u00a0Brittleness of\u00a0CLIP Text Encoders"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9597-1832","authenticated-orcid":false,"given":"Allie","family":"Tran","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5389-9465","authenticated-orcid":false,"given":"Luca","family":"Rossetto","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,10]]},"reference":[{"key":"26_CR1","unstructured":"Abad\u00a0Rocamora, E., Schlarmann, C., Deep\u00a0Singh, N., Wu, Y., Hein, M., Cevher, V.: Robustness in both domains: clip needs a robust text encoder. arXiv e-prints pp. arXiv\u20132506 (2025)"},{"key":"26_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR3","doi-asserted-by":"crossref","unstructured":"Alhamoud, K., et al.: Vision-language models do not understand negation. In: Proceedings of the Computer Vision and Pattern Recognition Conference, pp. 29612\u201329622 (2025)","DOI":"10.1109\/CVPR52734.2025.02757"},{"key":"26_CR4","unstructured":"Awad, G., Fiscus, J., Godil, A., Diduch, L., Graham, Y., Qu\u00e9not, G.: Trecvid 2024 - evaluating video search, captioning, and activity recognition. In: Proceedings of TRECVID 2024. NIST, USA (2024)"},{"key":"26_CR5","unstructured":"Brendel, W., Bethge, M.: Approximating CNNs with bag-of-local-features models works surprisingly well on imagenet. CoRR arxiv:1904.00760 (2019)"},{"key":"26_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Z., Chen, G.H., Diao, S., Wan, X., Wang, B.: On the difference of bert-style and clip-style text encoders. arXiv preprint arXiv:2306.03678 (2023)","DOI":"10.18653\/v1\/2023.findings-acl.866"},{"key":"26_CR7","doi-asserted-by":"crossref","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. 
In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, vol. 1 (long and short papers), pp. 4171\u20134186 (2019)","DOI":"10.18653\/v1\/N19-1423"},{"key":"26_CR8","unstructured":"Fahim, A., Murphy, A., Fyshe, A.: It\u2019s not a modality gap: characterizing and addressing the contrastive gap. arXiv preprint arXiv:2405.18570 (2024)"},{"key":"26_CR9","doi-asserted-by":"crossref","unstructured":"Hessel, J., Schofield, A.: How effective is BERT without word ordering? Implications for language understanding and data privacy. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, vol. 2: Short Papers, pp. 204\u2013211. Association for Computational Linguistics, Online (2021)","DOI":"10.18653\/v1\/2021.acl-short.27"},{"key":"26_CR10","doi-asserted-by":"crossref","unstructured":"Kamath, A., Hessel, J., Chang, K.W.: What\u2019s \u201cup\u201d with vision-language models? Investigating their struggle with spatial reasoning (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.568"},{"key":"26_CR11","unstructured":"Kang, R., Song, Y., Gkioxari, G., Perona, P.: Is clip ideal? No. can we fix it? Yes! (2025)"},{"key":"26_CR12","doi-asserted-by":"crossref","unstructured":"Kim, H., et al.: Fine-tuning clip text encoders with two-step paraphrasing. arXiv preprint arXiv:2402.15120 (2024)","DOI":"10.18653\/v1\/2024.findings-eacl.144"},{"key":"26_CR13","doi-asserted-by":"crossref","unstructured":"Leopold, M., Schoeffmann, K.: diveXplore at ivr4b 2024. In: 2024 International Conference on Content-Based Multimedia Indexing (CBMI), pp.\u00a01\u20137. IEEE (2024)","DOI":"10.1109\/CBMI62980.2024.10859204"},{"key":"26_CR14","unstructured":"Levi, M.Y., Gilboa, G.: The double-ellipsoid geometry of clip. arXiv preprint arXiv:2411.14517 (2024)"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Lewis, M., et al.: Does clip bind concepts? Probing compositionality in large image models (2024)","DOI":"10.18653\/v1\/2024.findings-eacl.101"},{"key":"26_CR16","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"26_CR17","first-page":"17612","volume":"35","author":"VW Liang","year":"2022","unstructured":"Liang, V.W., Zhang, Y., Kwon, Y., Yeung, S., Zou, J.Y.: Mind the gap: understanding the modality gap in multi-modal contrastive representation learning. Adv. Neural. Inf. Process. Syst. 35, 17612\u201317625 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR18","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"26_CR19","unstructured":"Newman, K., Wang, S., Zang, Y., Heffren, D., Sun, C.: Do pre-trained vision-language models encode object states? arXiv preprint arXiv:2409.10488 (2024)"},{"key":"26_CR20","doi-asserted-by":"crossref","unstructured":"Paiss, R., et al.: Teaching clip to count to ten. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3170\u20133180 (2023)","DOI":"10.1109\/ICCV51070.2023.00294"},{"key":"26_CR21","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. 
In: International Conference on Machine Learning, pp. 8748\u20138763. PmLR (2021)"},{"key":"26_CR22","unstructured":"Rocamora, E.A., Schlarmann, C., Singh, N.D., Wu, Y., Hein, M., Cevher, V.: Robustness in both domains: clip needs a robust text encoder. arXiv preprint arXiv:2506.03355 (2025)"},{"key":"26_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"349","DOI":"10.1007\/978-3-030-05710-7_29","volume-title":"MultiMedia Modeling","author":"L Rossetto","year":"2019","unstructured":"Rossetto, L., Schuldt, H., Awad, G., Butt, A.A.: V3C \u2013 a research video collection. In: Kompatsiaris, I., Huet, B., Mezaris, V., Gurrin, C., Cheng, W.-H., Vrochidis, S. (eds.) MMM 2019. LNCS, vol. 11295, pp. 349\u2013360. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-05710-7_29"},{"key":"26_CR24","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: Laion-5b: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR25","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: FLAVA: a foundational language and vision alignment model. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, 18\u201324 June 2022, pp. 15617\u201315629. IEEE (2022)","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"26_CR26","unstructured":"Singh, J., Shrivastava, I., Vatsa, M., Singh, R., Bharati, A.: Learn \u201cno\u201d to say \u201cyes\u201d better: improving vision-language models via negations. arXiv preprint arXiv:2403.20312 (2024)"},{"key":"26_CR27","unstructured":"Tejankar, A., et al,: A fistful of words: learning transferable visual models from bag-of-words supervision. CoRR arxiv:2112.13884 (2021)"},{"key":"26_CR28","doi-asserted-by":"crossref","unstructured":"Thrush, T., et al.: Winoground: probing vision and language models for visio-linguistic compositionality. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, 18\u201324 June 2022, pp. 5228\u20135238. IEEE (2022)","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"26_CR29","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9568\u20139578 (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"26_CR30","doi-asserted-by":"publisher","first-page":"79342","DOI":"10.1109\/ACCESS.2024.3405638","volume":"12","author":"L Vadicamo","year":"2024","unstructured":"Vadicamo, L., et al.: Evaluating performance and trends in interactive video retrieval: insights from the 12th VBS competition. IEEE Access 12, 79342\u201379366 (2024)","journal-title":"IEEE Access"},{"key":"26_CR31","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"26_CR32","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., Bowman, S.R.: Glue: a multi-task benchmark and analysis platform for natural language understanding. 
arXiv preprint arXiv:1804.07461 (2018)","DOI":"10.18653\/v1\/W18-5446"},{"key":"26_CR33","first-page":"5696","volume":"35","author":"J Wang","year":"2022","unstructured":"Wang, J., et al.: Omnivl: one foundation model for image-language and video-language tasks. Adv. Neural. Inf. Process. Syst. 35, 5696\u20135710 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"4","key":"26_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1852102.1852106","volume":"28","author":"W Webber","year":"2010","unstructured":"Webber, W., Moffat, A., Zobel, J.: A similarity measure for indefinite rankings. ACM Trans. Inform. Syst. (TOIS) 28(4), 1\u201338 (2010)","journal-title":"ACM Trans. Inform. Syst. (TOIS)"},{"key":"26_CR35","unstructured":"Wu, A., et\u00a0al.: Llm2clip: extending the capability boundaries of clip through large language models (2025)"},{"key":"26_CR36","unstructured":"Yuksekgonul, M., Bianchi, F., Kalluri, P., Jurafsky, D., Zou, J.: When and why vision-language models behave like bags-of-words, and what to do about it? (2023)"},{"key":"26_CR37","doi-asserted-by":"crossref","unstructured":"Zhai, X., et al.: Lit: Zero-shot transfer with locked-image text tuning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, 18\u201324 June 2022, pp. 18102\u201318112. IEEE (2022)","DOI":"10.1109\/CVPR52688.2022.01759"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-6957-1_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T10:43:44Z","timestamp":1770633824000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-6957-1_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819569564","9789819569571"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-6957-1_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"10 February 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Prague","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Czech Republic","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2026","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 January 2026","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 January 2026","order":8,"name":"conference_end_date","label":"Conference End 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"32","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2026","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2026.cz\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}