{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T18:49:24Z","timestamp":1743101364478,"version":"3.40.3"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031533013"},{"type":"electronic","value":"9783031533020"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-53302-0_16","type":"book-chapter","created":{"date-parts":[[2024,1,28]],"date-time":"2024-01-28T09:02:09Z","timestamp":1706432529000},"page":"219-233","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mutant Texts: A Technique for\u00a0Uncovering Unexpected Inconsistencies in\u00a0Large-Scale Vision-Language Models"],"prefix":"10.1007","author":[{"given":"Mingliang","family":"Liang","sequence":"first","affiliation":[]},{"given":"Zhouran","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Martha","family":"Larson","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,1,29]]},"reference":[{"issue":"5","key":"16_CR1","doi-asserted-by":"publisher","first-page":"e13291","DOI":"10.1111\/cogs.13291","volume":"47","author":"KS Brown","year":"2023","unstructured":"Brown, K.S., et al.: Investigating the extent to which distributional semantic models capture a broad range of semantic relations. Cogn. Sci. 47(5), e13291 (2023)","journal-title":"Cogn. Sci."},{"unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.E.: A simple framework for contrastive learning of visual representations. In: ICML, vol. 119, pp. 1597\u20131607 (2020)","key":"16_CR2"},{"doi-asserted-by":"crossref","unstructured":"Cherti, M., et al.: Reproducible scaling laws for contrastive language-image learning. In: CVPR, pp. 2818\u20132829 (2023)","key":"16_CR3","DOI":"10.1109\/CVPR52729.2023.00276"},{"unstructured":"Dosovitskiy, A., et al.: An image is worth 16\u00a0$$\\times $$\u00a016 words: transformers for image recognition at scale. In: ICLR (2021)","key":"16_CR4"},{"doi-asserted-by":"crossref","unstructured":"Dou, Z., et al.: An empirical study of training end-to-end vision-and-language transformers. In: CVPR, pp. 18145\u201318155 (2022)","key":"16_CR5","DOI":"10.1109\/CVPR52688.2022.01763"},{"unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: VSE++: improving visual-semantic embeddings with hard negatives. In: BMVC, p. 12 (2018)","key":"16_CR6"},{"unstructured":"Frome, A., et al.: DeViSE: a deep visual-semantic embedding model. In: Burges, C.J.C., Bottou, L., Ghahramani, Z., Weinberger, K.Q. (eds.) NeurIPS, pp. 2121\u20132129 (2013)","key":"16_CR7"},{"doi-asserted-by":"crossref","unstructured":"Gui, L., Wang, B., Huang, Q., Hauptmann, A., Bisk, Y., Gao, J.: KAT: a knowledge augmented transformer for vision-and-language. In: The Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 956\u2013968 (2022)","key":"16_CR8","DOI":"10.18653\/v1\/2022.naacl-main.70"},{"doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.B.: Momentum contrast for unsupervised visual representation learning. In: CVPR, pp. 9726\u20139735 (2020)","key":"16_CR9","DOI":"10.1109\/CVPR42600.2020.00975"},{"unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Meila, M., Zhang, T. (eds.) ICML, vol. 139, pp. 4904\u20134916 (2021)","key":"16_CR10"},{"doi-asserted-by":"crossref","unstructured":"Karpathy, A., Li, F.: Deep visual-semantic alignments for generating image descriptions. In: CVPR, pp. 3128\u20133137 (2015)","key":"16_CR11","DOI":"10.1109\/CVPR.2015.7298932"},{"unstructured":"Kim, W., Son, B., Kim, I.: ViLT: vision-and-language transformer without convolution or region supervision. In: Meila, M., Zhang, T. (eds.) ICML, vol. 139, pp. 5583\u20135594 (2021)","key":"16_CR12"},{"unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.C.H.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Chaudhuri, K., Jegelka, S., Song, L., Szepesv\u00e1ri, C., Niu, G., Sabato, S. (eds.) ICML, vol. 162, pp. 12888\u201312900 (2022)","key":"16_CR13"},{"unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint (2019)","key":"16_CR14"},{"doi-asserted-by":"crossref","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: What does BERT with vision look at? In: ACL, pp. 5265\u20135275 (2020)","key":"16_CR15","DOI":"10.18653\/v1\/2020.acl-main.469"},{"key":"16_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"16_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"issue":"11","key":"16_CR18","first-page":"2579","volume":"9","author":"L Van der Maaten","year":"2008","unstructured":"Van der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11), 2579\u20132605 (2008)","journal-title":"J. Mach. Learn. Res."},{"unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. In: ICLR (2013)","key":"16_CR19"},{"unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: NeurIPS, pp. 3111\u20133119 (2013)","key":"16_CR20"},{"unstructured":"van den Oord, A., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint (2018)","key":"16_CR21"},{"doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.: GloVe: global vectors for word representation. In: EMNLP, pp. 1532\u20131543 (2014)","key":"16_CR22","DOI":"10.3115\/v1\/D14-1162"},{"unstructured":"Qiu, J., et al.: Are multimodal models robust to image and text perturbations? arXiv preprint (2022)","key":"16_CR23"},{"unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) ICML, vol. 139, pp. 8748\u20138763 (2021)","key":"16_CR24"},{"unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint (2022)","key":"16_CR25"},{"doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","key":"16_CR26","DOI":"10.1109\/CVPR52688.2022.01042"},{"doi-asserted-by":"crossref","unstructured":"Schroff, F., Kalenichenko, D., Philbin, J.: FaceNet: a unified embedding for face recognition and clustering. In: CVPR, pp. 815\u2013823 (2015)","key":"16_CR27","DOI":"10.1109\/CVPR.2015.7298682"},{"unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. In: NeurIPS, vol. 35, pp. 25278\u201325294 (2022)","key":"16_CR28"},{"unstructured":"Schuhmann, C., et al.: LAION-400M: open dataset of CLIP-filtered 400 million image-text pairs. arXiv preprint: abs\/2111.02114 (2021)","key":"16_CR29"},{"doi-asserted-by":"crossref","unstructured":"Shtedritski, A., Rupprecht, C., Vedaldi, A.: What does CLIP know about a red circle? Visual prompt engineering for VLMs. In: ICCV, pp. 11987\u201311997 (2023)","key":"16_CR30","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"16_CR31","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., et al.: YFCC100M: the new data in multimedia research. Commun. ACM 59, 64\u201373 (2016)","journal-title":"Commun. ACM"},{"key":"16_CR32","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/s10994-010-5198-3","volume":"81","author":"J Weston","year":"2010","unstructured":"Weston, J., Bengio, S., Usunier, N.: Large scale image annotation: learning to rank with joint word-image embeddings. Mach. Learn. 81, 21\u201335 (2010)","journal-title":"Mach. Learn."},{"doi-asserted-by":"crossref","unstructured":"Wolfe, R., Banaji, M.R., Caliskan, A.: Evidence for hypodescent in visual semantic AI. In: ACM Conference on Fairness, Accountability, and Transparency (2022)","key":"16_CR33","DOI":"10.1145\/3531146.3533185"},{"unstructured":"Yasunaga, M., et al.: Retrieval-augmented multimodal language modeling. arXiv preprint (2022)","key":"16_CR34"},{"key":"16_CR35","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"doi-asserted-by":"crossref","unstructured":"Zhai, X., et al.: LiT: zero-shot transfer with locked-image text tuning. In: CVPR (2022)","key":"16_CR36","DOI":"10.1109\/CVPR52688.2022.01759"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-53302-0_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T11:58:29Z","timestamp":1709812709000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-53302-0_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031533013","9783031533020"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-53302-0_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"29 January 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 January 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 February 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"ConfTool Pro","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"297","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"112","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"38% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}