{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,13]],"date-time":"2026-07-13T18:48:30Z","timestamp":1783968510283,"version":"3.55.0"},"publisher-location":"Cham","reference-count":57,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726484","type":"print"},{"value":"9783031726491","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72649-1_5","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"73-90","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":27,"title":["DreamLIP: Language-Image Pre-training with\u00a0Long Captions"],"prefix":"10.1007","author":[{"given":"Kecheng","family":"Zheng","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yifei","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wei","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fan","family":"Lu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuailei","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xin","family":"Jin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wei","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yujun","family":"Shen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"5_CR1","unstructured":"Betker, J., et\u00a0al.: Improving image generation with better captions. Comput. Sci. https:\/\/cdnopenai.com\/papers\/dall-e-3. pdf 2(3), 8 (2023)"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Bossard, L., Guillaumin, M., Van\u00a0Gool, L.: Food-101\u2013mining discriminative components with random forests. In: Computer Vision\u2013ECCV 2014: 13th European Conference, zurich, Switzerland, September 6\u201312, 2014, Proceedings, part VI 13, pp. 446\u2013461 (2014)","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., Ferrari, V.: Coco-stuff: thing and stuff classes in context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1209\u20131218 (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"5_CR4","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: ShareGPT4V: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., Vedaldi, A.: Describing textures in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2014)","DOI":"10.1109\/CVPR.2014.461"},{"key":"5_CR6","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. Adv. Neural Inform. Process. Syst. 36 (2024)"},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Dong, X., et\u00a0al.: Maskclip: masked self-distillation advances contrastive language-image pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10995\u201311005 (2023)","DOI":"10.1109\/CVPR52729.2023.01058"},{"key":"5_CR9","first-page":"32942","volume":"35","author":"ZY Dou","year":"2022","unstructured":"Dou, Z.Y., et al.: Coarse-to-fine vision-language pre-training with fusion in the backbone. Adv. Neural Inform. Process. Syst. 35, 32942\u201332956 (2022)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"5_CR10","unstructured":"Everingham, M., Winn, J.: The pascal visual object classes challenge 2012 (voc2012) development kit. Pattern Anal. Stat. Model. Comput. Learn., Tech. Rep 2007(1\u201345), 5 (2012)"},{"key":"5_CR11","unstructured":"Fan, L., Krishnan, D., Isola, P., Katabi, D., Tian, Y.: Improving clip training with language rewrites. Adv. Neural Inform. Process. Syst. 36 (2024)"},{"key":"5_CR12","unstructured":"Fei-Fei, L., Fergus, R., Perona, P.: Learning generative visual models from few training examples: an incremental Bayesian approach tested on 101 object categories. In: 2004 Conference on Computer Vision and Pattern Recognition Workshop (2004)"},{"key":"5_CR13","first-page":"20450","volume":"35","author":"A F\u00fcrst","year":"2022","unstructured":"F\u00fcrst, A., et al.: Cloob: Modern Hopfield networks with infoloob outperform clip. Adv. Neural Inform. Process. Syst. 35, 20450\u201320468 (2022)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Gao, Y., et al.: Softclip: Softer cross-modal alignment makes clip stronger. arXiv preprint arXiv:2303.17561 (2023)","DOI":"10.1609\/aaai.v38i3.27955"},{"key":"5_CR15","first-page":"35959","volume":"35","author":"Y Gao","year":"2022","unstructured":"Gao, Y., et al.: Pyramidclip: hierarchical feature alignment for vision-language model pretraining. Adv. Neural Inform. Process. Syst. 35, 35959\u201335970 (2022)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"5_CR16","unstructured":"Geng, S., Yuan, J., Tian, Y., Chen, Y., Zhang, Y.: HiCLIP: contrastive language-image pretraining with hierarchy-aware attention. In: International Conference Learning Represent (2023)"},{"key":"5_CR17","unstructured":"Hammoud, H.A.A.K., Itani, H., Pizzati, F., Torr, P., Bibi, A., Ghanem, B.: Synthclip: Are we ready for a fully synthetic clip training? arXiv preprint arXiv:2402.01832 (2024)"},{"key":"5_CR18","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning (2021)"},{"key":"5_CR19","doi-asserted-by":"crossref","unstructured":"Johnson-Roberson, M., Barto, C., Mehta, R., Sridhar, S.N., Rosaen, K., Vasudevan, R.: Driving in the matrix: Can virtual worlds replace human-generated annotations for real world tasks? arXiv preprint arXiv:1610.01983 (2016)","DOI":"10.1109\/ICRA.2017.7989092"},{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"Kim, B., Jo, Y., Kim, J., Kim, S.: Misalign, contrast then distill: rethinking misalignments in language-image pre-training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2563\u20132572 (2023)","DOI":"10.1109\/ICCV51070.2023.00242"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3d object representations for fine-grained categorization. In: ICCVW (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"key":"5_CR22","unstructured":"Krizhevsky, A., et\u00a0al.: Learning multiple layers of features from tiny images (2009)"},{"key":"5_CR23","unstructured":"Lai, Z., et\u00a0al.: From scarcity to efficiency: Improving clip training via visual-enriched captions. arXiv preprint arXiv:2310.07699 (2023)"},{"key":"5_CR24","unstructured":"Lee, J., et al.: Uniclip: Unified framework for contrastive language-image pre-training. arXiv:2209.13430 (2022)"},{"key":"5_CR25","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning (2022)"},{"key":"5_CR26","unstructured":"Li, Y., Liang, F., Zhao, L., Cui, Y., Ouyang, W., Shao, J., Yu, F., Yan, J.: Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. arXiv:2110.05208 (2021)"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Li, Y., Fan, H., Hu, R., Feichtenhofer, C., He, K.: Scaling language-image pre-training via masking. In: IEEE Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23390\u201323400 (2023)","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"5_CR29","doi-asserted-by":"publisher","unstructured":"Lin, T.Y., et al.: Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"5_CR31","unstructured":"Liu, Y., et al.: Mllms-augmented visual-language representation learning. arXiv preprint arXiv:2311.18765 (2023)"},{"key":"5_CR32","first-page":"2507","volume":"35","author":"P Lu","year":"2022","unstructured":"Lu, P., et al.: Learn to explain: multimodal reasoning via thought chains for science question answering. Adv. Neural Inform. Process. Syst. 35, 2507\u20132521 (2022)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"5_CR33","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., Vedaldi, A.: Fine-grained visual classification of aircraft. arXiv:1306.5151 (2013)"},{"key":"5_CR34","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., et al.: The role of context for object detection and semantic segmentation in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 891\u2013898 (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"5_CR35","doi-asserted-by":"crossref","unstructured":"Mu, N., Kirillov, A., Wagner, D., Xie, S.: Slip: self-supervision meets language-image pre-training. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-19809-0_30"},{"key":"5_CR36","doi-asserted-by":"crossref","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: Sixth Indian Conference on Computer Vision, Graphics & Image Processing (2008)","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"5_CR37","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv:1807.03748 (2018)"},{"key":"5_CR38","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., Jawahar, C.: Cats and dogs. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition (2012)","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"5_CR39","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (2021)"},{"key":"5_CR40","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"5_CR41","doi-asserted-by":"publisher","unstructured":"Richter, S.R., Vineet, V., Roth, S., Koltun, V.: Playing for data: ground truth from computer games. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part II 14, pp. 102\u2013118. Springer (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_7","DOI":"10.1007\/978-3-319-46475-6_7"},{"key":"5_CR42","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: IEEE Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"5_CR43","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Association for Computational Linguistics (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"5_CR44","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: IEEE Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"5_CR45","unstructured":"Tian, Y., Fan, L., Isola, P., Chang, H., Krishnan, D.: Stablerep: synthetic images from text-to-image models make strong visual representation learners. Adv. Neural Inform. Process. Syst. 36 (2024)"},{"key":"5_CR46","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? Exploring the visual shortcomings of multimodal llms. arXiv preprint arXiv:2401.06209 (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"5_CR47","unstructured":"Wu, S., Fei, H., Zhang, H., Chua, T.S.: Imagine that! abstract-to-intricate text-to-image synthesis with scene graph hallucination diffusion. Adv. Neural Inform. Process. Syst. 36 (2023)"},{"key":"5_CR48","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K.A., Oliva, A., Torralba, A.: Sun database: large-scale scene recognition from abbey to zoo. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (2010)","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"5_CR49","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Hu, H., Bai, X.: Side adapter network for open-vocabulary semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2945\u20132954 (2023)","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"5_CR50","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Unified contrastive learning in image-text-label space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19163\u201319173 (2022)","DOI":"10.1109\/CVPR52688.2022.01857"},{"key":"5_CR51","doi-asserted-by":"crossref","unstructured":"Yang, K., et al.: Alip: adaptive language-image pre-training with synthetic caption. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2922\u20132931 (2023)","DOI":"10.1109\/ICCV51070.2023.00273"},{"key":"5_CR52","unstructured":"Yao, L., et al.: Filip: fine-grained interactive language-image pre-training. arXiv:2111.07783 (2021)"},{"key":"5_CR53","doi-asserted-by":"crossref","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguis. pp. 67\u201378 (2014)","DOI":"10.1162\/tacl_a_00166"},{"key":"5_CR54","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"5_CR55","unstructured":"Yuan, J., Zhang, J., Sun, S., Torr, P., Zhao, B.: Real-fake: Effective training data synthesis through distribution matching. arXiv preprint arXiv:2310.10402 (2023)"},{"key":"5_CR56","unstructured":"Zhao, L., Zheng, K., Zheng, Y., Zhao, D., Zhou, J.: RLEG: Vision-language representation learning with diffusion-based embedding generation. International Conference on Machine Learning (2023)"},{"key":"5_CR57","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ade20k dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 633\u2013641 (2017)","DOI":"10.1109\/CVPR.2017.544"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72649-1_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:18:58Z","timestamp":1732828738000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72649-1_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726484","9783031726491"],"references-count":57,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72649-1_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}