{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:31:24Z","timestamp":1778081484683,"version":"3.51.4"},"publisher-location":"Cham","reference-count":48,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729454","type":"print"},{"value":"9783031729461","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72946-1_7","type":"book-chapter","created":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T19:02:08Z","timestamp":1727809328000},"page":"111-127","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":19,"title":["VeCLIP: Improving CLIP Training via\u00a0Visual-Enriched Captions"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2984-7913","authenticated-orcid":false,"given":"Zhengfeng","family":"Lai","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6809-0426","authenticated-orcid":false,"given":"Haotian","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bowen","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wentao","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haoping","family":"Bai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aleksei","family":"Timofeev","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xianzhi","family":"Du","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhe","family":"Gan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiulong","family":"Shan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2772-387X","authenticated-orcid":false,"given":"Chen-Nee","family":"Chuah","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yinfei","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Meng","family":"Cao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,2]]},"reference":[{"key":"7_CR1","unstructured":"Abbas, A., Tirumala, K., Simig, D., Ganguli, S., Morcos, A.S.: Semdedup: data-efficient learning at web-scale through semantic deduplication. arXiv preprint arXiv:2303.09540 (2023)"},{"key":"7_CR2","unstructured":"Betker, J., et al.: Improving image generation with better captions. OpenAI (2023)"},{"key":"7_CR3","unstructured":"Bradbury, J., et al.: JAX: composable transformations of Python+NumPy programs. Github (2018). http:\/\/github.com\/google\/jax"},{"key":"7_CR4","unstructured":"Cao, L., et al.: Less is more: removing text-regions improves clip training efficiency and robustness. arXiv preprint arXiv:2305.05095 (2023)"},{"key":"7_CR5","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12m: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: Sharegpt4v: improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"7_CR7","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"issue":"10","key":"7_CR8","doi-asserted-by":"publisher","first-page":"1865","DOI":"10.1109\/jproc.2017.2675998","volume":"105","author":"G Cheng","year":"2017","unstructured":"Cheng, G., Han, J., Lu, X.: Remote sensing image scene classification: benchmark and state of the art. Proc. IEEE 105(10), 1865\u20131883 (2017). https:\/\/doi.org\/10.1109\/jproc.2017.2675998","journal-title":"Proc. IEEE"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Cherti, M., et al.: Reproducible scaling laws for contrastive language-image learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132829 (2023)","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., , Vedaldi, A.: Describing textures in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2014)","DOI":"10.1109\/CVPR.2014.461"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"7_CR12","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"7_CR13","unstructured":"Fan, L., Krishnan, D., Isola, P., Katabi, D., Tian, Y.: Improving clip training with language rewrites. arXiv preprint arXiv:2305.20088 (2023)"},{"key":"7_CR14","unstructured":"Fang, A., Jose, A.M., Jain, A., Schmidt, L., Toshev, A.T., Shankar, V.: Data filtering networks. In: NeurIPS 2023 Workshop on Distribution Shifts: New Frontiers with Foundation Models (2023)"},{"key":"7_CR15","unstructured":"Fei-Fei, L., Fergus, R., Perona, P.: Learning generative visual models from few training examples: an incremental Bayesian approach tested on 101 object categories. In: 2004 Conference on Computer Vision and Pattern Recognition Workshop, pp. 178\u2013178. IEEE (2004)"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Helber, P., Bischke, B., Dengel, A., Borth, D.: Introducing Eurosat: a novel dataset and deep learning benchmark for land use and land cover classification. In: IGARSS 2018-2018 IEEE International Geoscience and Remote Sensing Symposium, pp. 204\u2013207. IEEE (2018)","DOI":"10.1109\/IGARSS.2018.8519248"},{"key":"7_CR18","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"7_CR19","unstructured":"Krizhevsky, A.: Learning multiple layers of features from tiny images. Can. Inst. Adv. Res. (2009)"},{"key":"7_CR20","unstructured":"Kwon, G., Cai, Z., Ravichandran, A., Bas, E., Bhotika, R., Soatto, S.: Masked vision and language modeling for multi-modal representation learning. In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"7_CR21","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"7_CR22","doi-asserted-by":"crossref","unstructured":"Li, Y., Fan, H., Hu, R., Feichtenhofer, C., He, K.: Scaling language-image pre-training via masking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23390\u201323400 (2023)","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"7_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014 Part V. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"7_CR24","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"7_CR25","unstructured":"Maini, P., Goyal, S., Lipton, Z.C., Kolter, J.Z., Raghunathan, A.: T-mars: improving visual representations by circumventing text feature learning. arXiv preprint arXiv:2307.03132 (2023)"},{"key":"7_CR26","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1007\/978-3-031-19809-0_30","volume-title":"ECCV","author":"N Mu","year":"2022","unstructured":"Mu, N., Kirillov, A., Wagner, D., Xie, S.: Slip: self-supervision meets language-image pre-training. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV. LNCS, vol. 13686, pp. 529\u2013544. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19809-0_30"},{"key":"7_CR27","unstructured":"Nguyen, T., Gadre, S.Y., Ilharco, G., Oh, S., Schmidt, L.: Improving multimodal datasets with image captioning. arXiv preprint arXiv:2307.10350 (2023)"},{"key":"7_CR28","doi-asserted-by":"crossref","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing, pp. 722\u2013729. IEEE (2008)","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., Jawahar, C.: Cats and dogs. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3498\u20133505. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"7_CR30","unstructured":"Pham, H., et al.: Combined scaling for zero-shot transfer learning. arXiv preprint arXiv:2111.10050 (2021)"},{"key":"7_CR31","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"7_CR32","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"7_CR33","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V.: Do imagenet classifiers generalize to imagenet? In: International Conference on Machine Learning, pp. 5389\u20135400. PMLR (2019)"},{"key":"7_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"205","DOI":"10.1007\/978-3-030-98358-1_17","volume-title":"MultiMedia Modeling","author":"K Schall","year":"2022","unstructured":"Schall, K., Barthel, K.U., Hezel, N., Jung, K.: GPR1200: a benchmark for\u00a0general-purpose content-based image retrieval. In: \u00de\u00f3r J\u00f3nsson, B., et al. (eds.) MMM 2022. LNCS, vol. 13141, pp. 205\u2013216. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-030-98358-1_17"},{"key":"7_CR35","unstructured":"Schuhmann, C., et al.: Laion-5b: An open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)"},{"key":"7_CR36","unstructured":"Schuhmann, C., et al.: Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"7_CR37","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., Birch, A.: Improving neural machine translation models with monolingual data. arXiv preprint arXiv:1511.06709 (2015)","DOI":"10.18653\/v1\/P16-1009"},{"key":"7_CR38","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"7_CR39","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"210","DOI":"10.1007\/978-3-030-00934-2_24","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2018","author":"BS Veeling","year":"2018","unstructured":"Veeling, B.S., Linmans, J., Winkens, J., Cohen, T., Welling, M.: Rotation equivariant CNNs for digital pathology. In: Frangi, A.F., Schnabel, J.A., Davatzikos, C., Alberola-L\u00f3pez, C., Fichtinger, G. (eds.) MICCAI 2018 Part II. LNCS, vol. 11071, pp. 210\u2013218. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-00934-2_24"},{"key":"7_CR40","doi-asserted-by":"crossref","unstructured":"Wei, J., Zou, K.: Eda: easy data augmentation techniques for boosting performance on text classification tasks. arXiv preprint arXiv:1901.11196 (2019)","DOI":"10.18653\/v1\/D19-1670"},{"key":"7_CR41","unstructured":"Wu, W., et\u00a0al.: Mofi: learning image representations from noisy entity annotated images. arXiv preprint arXiv:2306.07952 (2023)"},{"key":"7_CR42","unstructured":"Xu, H., et al.: Demystifying clip data. arXiv preprint arXiv:2309.16671 (2023)"},{"key":"7_CR43","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"7_CR44","unstructured":"Yuan, L., et\u00a0al.: Florence: a new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"7_CR45","unstructured":"Yuval, N.: Reading digits in natural images with unsupervised feature learning. In: Proceedings of the NIPS Workshop on Deep Learning and Unsupervised Feature Learning (2011)"},{"key":"7_CR46","unstructured":"Zhai, X., et\u00a0al.: A large-scale study of representation learning with the visual task adaptation benchmark. arXiv preprint arXiv:1910.04867 (2019)"},{"key":"7_CR47","doi-asserted-by":"crossref","unstructured":"Zhai, X., et al.: Lit: zero-shot transfer with locked-image text tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18123\u201318133 (2022)","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"7_CR48","unstructured":"Zheng, L., et al.: Judging LLM-as-a-judge with MT-bench and chatbot arena (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72946-1_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T23:35:28Z","timestamp":1732836928000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72946-1_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,2]]},"ISBN":["9783031729454","9783031729461"],"references-count":48,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72946-1_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,2]]},"assertion":[{"value":"2 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}