{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T12:42:56Z","timestamp":1765370576890,"version":"3.40.3"},"publisher-location":"Cham","reference-count":80,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031728471"},{"type":"electronic","value":"9783031728488"}],"license":[{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72848-8_14","type":"book-chapter","created":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T13:37:51Z","timestamp":1732801071000},"page":"233-251","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["SPARO: Selective Attention for\u00a0Robust and\u00a0Compositional Transformer Encodings for\u00a0Vision"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-4781-9995","authenticated-orcid":false,"given":"Ankit","family":"Vani","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9193-1908","authenticated-orcid":false,"given":"Bac","family":"Nguyen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8370-2415","authenticated-orcid":false,"given":"Samuel","family":"Lavoie","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8784-2531","authenticated-orcid":false,"given":"Ranjay","family":"Krishna","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6223-0301","authenticated-orcid":false,"given":"Aaron","family":"Courville","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,29]]},"reference":[{"key":"14_CR1","unstructured":"Aydemir, G., Xie, W., G\u00fcney, F.: Self-supervised object-centric learning for videos. arXiv preprint arXiv:2310.06907 (2023)"},{"key":"14_CR2","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. arXiv preprint arXiv:1607.06450 (2016)"},{"key":"14_CR3","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)"},{"key":"14_CR4","unstructured":"Barbu, A., et al.: ObjectNet: a large-scale bias-controlled dataset for pushing the limits of object recognition models. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"14_CR5","unstructured":"Beattie, C., et\u00a0al.: DeepMind lab. arXiv preprint arXiv:1612.03801 (2016)"},{"key":"14_CR6","volume-title":"Handbook of Perception and Human Performance","author":"KR Boff","year":"1986","unstructured":"Boff, K.R., Kaufman, L., Thomas, J.P.: Handbook of Perception and Human Performance, vol. 1. Wiley, New York (1986)"},{"key":"14_CR7","unstructured":"Brady, J., Zimmermann, R.S., Sharma, Y., Sch\u00f6lkopf, B., von K\u00fcgelgen, J., Brendel, W.: Provably learning object-centric representations. arXiv preprint arXiv:2305.14229 (2023)"},{"key":"14_CR8","unstructured":"Burgess, C.P., et al.: MoNet: unsupervised scene decomposition and representation. arXiv preprint arXiv:1901.11390 (2019)"},{"key":"14_CR9","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"14_CR10","doi-asserted-by":"crossref","unstructured":"Chang, H.S., Sun, R.Y., Ricci, K., McCallum, A.: Multi-CLS BERT: an efficient alternative to traditional ensembling. arXiv preprint arXiv:2210.05043 (2022)","DOI":"10.18653\/v1\/2023.acl-long.48"},{"key":"14_CR11","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"14_CR12","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., He, K.: An empirical study of training self-supervised vision transformers. arXiv preprint arXiv:2104.02057 (2021)","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"14_CR13","doi-asserted-by":"crossref","unstructured":"Chen, Y., et al.: Revisiting multimodal representation in contrastive learning: from patch and token embeddings to finite discrete tokens. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15095\u201315104 (2023)","DOI":"10.1109\/CVPR52729.2023.01449"},{"issue":"10","key":"14_CR14","doi-asserted-by":"publisher","first-page":"1865","DOI":"10.1109\/JPROC.2017.2675998","volume":"105","author":"G Cheng","year":"2017","unstructured":"Cheng, G., Han, J., Lu, X.: Remote sensing image scene classification: benchmark and state of the art. Proc. IEEE 105(10), 1865\u20131883 (2017)","journal-title":"Proc. IEEE"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., Vedaldi, A.: Describing textures in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3606\u20133613 (2014)","DOI":"10.1109\/CVPR.2014.461"},{"issue":"1","key":"14_CR16","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1146\/annurev.neuro.22.1.319","volume":"22","author":"CL Colby","year":"1999","unstructured":"Colby, C.L., Goldberg, M.E.: Space and attention in parietal cortex. Annu. Rev. Neurosci. 22(1), 319\u2013349 (1999)","journal-title":"Annu. Rev. Neurosci."},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"14_CR18","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: North American Chapter of the Association for Computational Linguistics (2019)"},{"key":"14_CR19","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16 $$\\times $$ 16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"14_CR20","unstructured":"Dugas, E., Jared, Jorge, Cukierski, W.: Diabetic retinopathy detection (2015). https:\/\/kaggle.com\/competitions\/diabetic-retinopathy-detection"},{"key":"14_CR21","unstructured":"Engelcke, M., Kosiorek, A.R., Jones, O.P., Posner, I.: Genesis: generative scene inference and sampling with object-centric latent representations. arXiv preprint arXiv:1907.13052 (2019)"},{"key":"14_CR22","unstructured":"Eslami, S., et\u00a0al.: Attend, infer, repeat: fast scene understanding with generative models. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"issue":"4","key":"14_CR23","doi-asserted-by":"publisher","first-page":"594","DOI":"10.1109\/TPAMI.2006.79","volume":"28","author":"L Fei-Fei","year":"2006","unstructured":"Fei-Fei, L., Fergus, R., Perona, P.: One-shot learning of object categories. IEEE Trans. Pattern Anal. Mach. Intell. 28(4), 594\u2013611 (2006)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"11","key":"14_CR24","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1177\/0278364913491297","volume":"32","author":"A Geiger","year":"2013","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R.: Vision meets robotics: the KITTI dataset. Int. J. Robot. Res. 32(11), 1231\u20131237 (2013)","journal-title":"Int. J. Robot. Res."},{"key":"14_CR25","unstructured":"Goyal, A., et al.: Neural production systems. Adv. Neural. Inf. Process. Syst. 34, 25673\u201325687 (2021)"},{"key":"14_CR26","unstructured":"Goyal, A., et al.: Factorizing declarative and procedural knowledge in structured, dynamical environments. In: International Conference on Learning Representations (2020)"},{"key":"14_CR27","unstructured":"Goyal, A., et al.: Recurrent independent mechanisms. arXiv preprint arXiv:1909.10893 (2019)"},{"key":"14_CR28","unstructured":"Greff, K., et al.: Multi-object representation learning with iterative variational inference. In: International Conference on Machine Learning, pp. 2424\u20132433. PMLR (2019)"},{"key":"14_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"7","key":"14_CR30","doi-asserted-by":"publisher","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","volume":"12","author":"P Helber","year":"2019","unstructured":"Helber, P., Bischke, B., Dengel, A., Borth, D.: EuroSat: a novel dataset and deep learning benchmark for land use and land cover classification. IEEE J. Sel. Top. Appl. Earth Observ. Remote Sens. 12(7), 2217\u20132226 (2019)","journal-title":"IEEE J. Sel. Top. Appl. Earth Observ. Remote Sens."},{"key":"14_CR31","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., et\u00a0al.: The many faces of robustness: a critical analysis of out-of-distribution generalization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8340\u20138349 (2021)","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"14_CR32","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., Steinhardt, J., Song, D.: Natural adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15262\u201315271 (2021)","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"14_CR33","first-page":"853","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: data, models and evaluation metrics. J. Arti. Intell. Res. 47, 853\u2013899 (2013)","journal-title":"J. Arti. Intell. Res."},{"key":"14_CR34","unstructured":"Hsieh, C.Y., Zhang, J., Ma, Z., Kembhavi, A., Krishna, R.: SugarCrepe: fixing hackable benchmarks for vision-language compositionality. Adv. Neural Inf. Process. Syst. (2023)"},{"key":"14_CR35","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: OpenCLIP (2021). https:\/\/github.com\/mlfoundations\/open_clip, https:\/\/doi.org\/10.5281\/zenodo.5143773","DOI":"10.5281\/zenodo.5143773"},{"key":"14_CR36","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., van\u00a0der Maaten, L., Fei-Fei, L., Lawrence\u00a0Zitnick, C., Girshick, R.: CLEVR: a diagnostic dataset for compositional language and elementary visual reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2901\u20132910 (2017)","DOI":"10.1109\/CVPR.2017.215"},{"key":"14_CR37","unstructured":"Krizhevsky, A., Hinton, G., et\u00a0al.: Learning multiple layers of features from tiny images. Technical report (2009)"},{"key":"14_CR38","unstructured":"LAION-AI: CLIP_benchmark open-source project (2022). https:\/\/github.com\/LAION-AI\/CLIP_benchmark"},{"key":"14_CR39","unstructured":"Lavoie, S., et al.: Simplicial embeddings in self-supervised learning and downstream classification. In: International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=RWtGreRpovS"},{"key":"14_CR40","doi-asserted-by":"crossref","unstructured":"LeCun, Y., Huang, F.J., Bottou, L.: Learning methods for generic object recognition with invariance to pose and lighting. In: Proceedings of the 2004 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, CVPR 2004, vol.\u00a02, pp. II\u2013104. IEEE (2004)","DOI":"10.1109\/CVPR.2004.1315150"},{"key":"14_CR41","unstructured":"Lee, J., Lee, Y., Kim, J., Kosiorek, A., Choi, S., Teh, Y.W.: Set transformer: a framework for attention-based permutation-invariant neural networks. In: International Conference on Machine Learning, pp. 3744\u20133753. PMLR (2019)"},{"key":"14_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"14_CR43","first-page":"11525","volume":"33","author":"F Locatello","year":"2020","unstructured":"Locatello, F., et al.: Object-centric learning with slot attention. Adv. Neural. Inf. Process. Syst. 33, 11525\u201311538 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"14_CR44","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"14_CR45","doi-asserted-by":"crossref","unstructured":"Ma, Z., Hong, J., Gul, M.O., Gandhi, M., Gao, I., Krishna, R.: CREPE: can vision-language foundation models reason compositionally? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10910\u201310921 (2023)","DOI":"10.1109\/CVPR52729.2023.01050"},{"key":"14_CR46","unstructured":"Mansouri, A., Hartford, J., Zhang, Y., Bengio, Y.: Object centric architectures enable efficient causal representation learning. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=r9FsiXZxZt"},{"key":"14_CR47","doi-asserted-by":"publisher","first-page":"1437","DOI":"10.1016\/S0042-6989(00)00267-4","volume":"41","author":"A Martinez","year":"2001","unstructured":"Martinez, A.: Putting spatial attention on the map: timing and localization of stimulus selection processes in striate and extrastriate visual areas. Vision. Res. 41, 1437\u20131457 (2001)","journal-title":"Vision. Res."},{"key":"14_CR48","unstructured":"Martins, A., Astudillo, R.: From softmax to sparsemax: a sparse model of attention and multi-label classification. In: International Conference on Machine Learning, pp. 1614\u20131623. PMLR (2016)"},{"key":"14_CR49","unstructured":"Matthey, L., Higgins, I., Hassabis, D., Lerchner, A.: dSprites: disentanglement testing sprites dataset (2017). https:\/\/github.com\/deepmind\/dsprites-dataset\/"},{"key":"14_CR50","unstructured":"Meta Research: DINO open-source repository (2021). https:\/\/github.com\/facebookresearch\/dino"},{"key":"14_CR51","doi-asserted-by":"crossref","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing, pp. 722\u2013729. IEEE (2008)","DOI":"10.1109\/ICVGIP.2008.47"},{"issue":"11","key":"14_CR52","doi-asserted-by":"publisher","first-page":"1203","DOI":"10.1038\/nn957","volume":"5","author":"DH O\u2019Connor","year":"2002","unstructured":"O\u2019Connor, D.H., Fukui, M.M., Pinsk, M.A., Kastner, S.: Attention modulates responses in the human lateral geniculate nucleus. Nat. Neurosci. 5(11), 1203\u20131209 (2002)","journal-title":"Nat. Neurosci."},{"key":"14_CR53","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., Jawahar, C.: Cats and dogs. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3498\u20133505. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"14_CR54","unstructured":"Paszke, A., et\u00a0al.: PyTorch: an imperative style, high-performance deep learning library. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"14_CR55","doi-asserted-by":"crossref","unstructured":"Qian, R., Ding, S., Liu, X., Lin, D.: Semantics meets temporal correspondence: self-supervised object-centric learning in videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16675\u201316687 (2023)","DOI":"10.1109\/ICCV51070.2023.01529"},{"key":"14_CR56","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"8","key":"14_CR57","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"14_CR58","unstructured":"Ray, A., Radenovic, F., Dubey, A., Plummer, B.A., Krishna, R., Saenko, K.: COLA: a benchmark for compositional text-to-image retrieval. In: Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2023)"},{"key":"14_CR59","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V.: Do ImageNet classifiers generalize to ImageNet? In: International Conference on Machine Learning, pp. 5389\u20135400. PMLR (2019)"},{"key":"14_CR60","unstructured":"Schuhmann, C., et al.: LAION-400M: open dataset of CLIP-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"14_CR61","doi-asserted-by":"crossref","unstructured":"Scott, W.A.: Cognitive complexity and cognitive flexibility. Sociometry 405\u2013414 (1962)","DOI":"10.2307\/2785779"},{"key":"14_CR62","unstructured":"Seitzer, M., et\u00a0al.: Bridging the gap to real-world object-centric learning. arXiv preprint arXiv:2209.14860 (2022)"},{"key":"14_CR63","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics, ACL 2018, Melbourne, Australia, 15\u201320 July 2018, Volume 1: Long Papers, pp. 2556\u20132565. Association for Computational Linguistics (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"14_CR64","unstructured":"Shazeer, N.: Fast transformer decoding: one write-head is all you need. arXiv preprint arXiv:1911.02150 (2019)"},{"key":"14_CR65","doi-asserted-by":"crossref","unstructured":"Thrush, T., et al.: WinoGround: probing vision and language models for visio-linguistic compositionality. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5238\u20135248 (2022)","DOI":"10.1109\/CVPR52688.2022.00517"},{"issue":"1","key":"14_CR66","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1016\/0010-0285(80)90005-5","volume":"12","author":"AM Treisman","year":"1980","unstructured":"Treisman, A.M., Gelade, G.: A feature-integration theory of attention. Cogn. Psychol. 12(1), 97\u2013136 (1980)","journal-title":"Cogn. Psychol."},{"key":"14_CR67","unstructured":"UW RAIVN Lab: SugarCrepe open-source repository (2023). https:\/\/github.com\/RAIVNLab\/sugar-crepe"},{"key":"14_CR68","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"14_CR69","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"210","DOI":"10.1007\/978-3-030-00934-2_24","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2018","author":"BS Veeling","year":"2018","unstructured":"Veeling, B.S., Linmans, J., Winkens, J., Cohen, T., Welling, M.: Rotation equivariant CNNs for digital pathology. In: Frangi, A.F., Schnabel, J.A., Davatzikos, C., Alberola-L\u00f3pez, C., Fichtinger, G. (eds.) MICCAI 2018. LNCS, vol. 11071, pp. 210\u2013218. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-00934-2_24"},{"key":"14_CR70","unstructured":"Wang, H., Ge, S., Lipton, Z., Xing, E.P.: Learning robust global representations by penalizing local predictive power. Adv. Neural Inf. Process. Syst. 10506\u201310518 (2019)"},{"key":"14_CR71","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K.A., Oliva, A., Torralba, A.: Sun database: large-scale scene recognition from abbey to zoo. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 3485\u20133492. IEEE (2010)","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"14_CR72","doi-asserted-by":"crossref","unstructured":"Xu, L., Ouyang, W., Bennamoun, M., Boussaid, F., Xu, D.: Multi-class token transformer for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4310\u20134319 (2022)","DOI":"10.1109\/CVPR52688.2022.00427"},{"key":"14_CR73","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"14_CR74","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"14_CR75","unstructured":"Yuksekgonul, M., Bianchi, F., Kalluri, P., Jurafsky, D., Zou, J.: When and why vision-language models behave like bags-of-words, and what to do about it? In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"14_CR76","unstructured":"Yuval, N.: Reading digits in natural images with unsupervised feature learning. In: Proceedings of the NIPS Workshop on Deep Learning and Unsupervised Feature Learning (2011)"},{"key":"14_CR77","unstructured":"Zadaianchuk, A., Kleindessner, M., Zhu, Y., Locatello, F., Brox, T.: Unsupervised semantic segmentation with self-supervised object-centric representations. arXiv preprint arXiv:2207.05027 (2022)"},{"key":"14_CR78","unstructured":"Zhai, X., et\u00a0al.: A large-scale study of representation learning with the visual task adaptation benchmark. arXiv preprint arXiv:1910.04867 (2019)"},{"key":"14_CR79","unstructured":"Zhang, Y., Hare, J., Prugel-Bennett, A.: Deep set prediction networks. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"14_CR80","doi-asserted-by":"crossref","unstructured":"Zhao, T., et al.: VL-checklist: evaluating pre-trained vision-language models with objects, attributes and relations. arXiv preprint arXiv:2207.00221 (2022)","DOI":"10.18653\/v1\/2022.emnlp-demos.4"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72848-8_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T14:08:34Z","timestamp":1732802914000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72848-8_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,29]]},"ISBN":["9783031728471","9783031728488"],"references-count":80,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72848-8_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,29]]},"assertion":[{"value":"29 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}