{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T09:30:44Z","timestamp":1780392644639,"version":"3.54.1"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729829","type":"print"},{"value":"9783031729836","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72983-6_18","type":"book-chapter","created":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:34:20Z","timestamp":1730108060000},"page":"310-325","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":101,"title":["Long-CLIP: Unlocking the\u00a0Long-Text Capability of\u00a0CLIP"],"prefix":"10.1007","author":[{"given":"Beichen","family":"Zhang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoyi","family":"Dong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuhang","family":"Zang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiaqi","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,29]]},"reference":[{"key":"18_CR1","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"18_CR2","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"18_CR3","unstructured":"Chen, S., Wong, S., Chen, L., Tian, Y.: Extending context window of large language models via positional interpolation. CoRR abs\/2306.15595 (2023)"},{"key":"18_CR4","doi-asserted-by":"crossref","unstructured":"Crowson, K., et al.: VQGAN-CLIP: open domain image generation and editing with natural language guidance. In: Avidan, S., Brostow, G.J., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV. LNCS, vol. 13697, pp. 88\u2013105. Springer (2022)","DOI":"10.1007\/978-3-031-19836-6_6"},{"key":"18_CR5","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: CVPR, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"18_CR6","unstructured":"Frans, K., Soros, L.B., Witkowski, O.: Clipdraw: exploring text-to-drawing synthesis through language-image encoders. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) NeurIPS (2022)"},{"key":"18_CR7","unstructured":"Gu, X., Lin, T., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. In: ICLR. OpenReview.net (2022)"},{"key":"18_CR8","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., Steinhardt, J., Song, D.: Natural adversarial examples. In: CVPR, pp. 15262\u201315271. Computer Vision Foundation\/IEEE (2021)","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"18_CR9","doi-asserted-by":"crossref","unstructured":"Kim, D., Angelova, A., Kuo, W.: Region-aware pretraining for open-vocabulary object detection with vision transformers. In: CVPR, pp. 11144\u201311154. IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.01072"},{"issue":"1","key":"18_CR10","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"18_CR11","unstructured":"Krizhevsky, A., Hinton, G., et\u00a0al.: Learning multiple layers of features from tiny images (2009)"},{"key":"18_CR12","unstructured":"Li, B., Weinberger, K.Q., Belongie, S.J., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. In: ICLR. OpenReview.net (2022)"},{"key":"18_CR13","unstructured":"Li, L.H., et al.: Grounded language-image pre-training. CoRR abs\/2112.03857 (2021)"},{"key":"18_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"18_CR15","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., et al.: Clip4clip: an empirical study of CLIP for end to end video clip retrieval and captioning. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"key":"18_CR16","doi-asserted-by":"crossref","unstructured":"Luo, Z., et al.: Lexlip: Lexicon-bottlenecked language-image pre-training for large-scale image-text sparse retrieval. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, October 1-6, 2023, pp. 11172\u201311183. IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.01029"},{"key":"18_CR17","unstructured":"OpenAI: GPT-4 technical report. CoRR abs\/2303.08774 (2023)"},{"key":"18_CR18","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.L.: Im2text: describing images using 1 million captioned photographs. In: Shawe-Taylor, J., Zemel, R.S., Bartlett, P.L., Pereira, F.C.N., Weinberger, K.Q. (eds.) NeruIPS, pp. 1143\u20131151 (2011)"},{"key":"18_CR19","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. CoRR abs\/2307.01952 (2023)"},{"key":"18_CR20","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) ICML. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021)"},{"key":"18_CR21","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. CoRR abs\/2204.06125 (2022)"},{"key":"18_CR22","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V.: Do imagenet classifiers generalize to imagenet? In: Chaudhuri, K., Salakhutdinov, R. (eds.) ICML. Proceedings of Machine Learning Research, vol.\u00a097, pp. 5389\u20135400. PMLR (2019)"},{"key":"18_CR23","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models (2021)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"18_CR24","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) NeruIPS (2022)"},{"key":"18_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063","volume":"568","author":"J Su","year":"2024","unstructured":"Su, J., Ahmed, M.H.M., Lu, Y., Pan, S., Bo, W., Liu, Y.: Roformer: enhanced transformer with rotary position embedding. Neurocomputing 568, 127063 (2024)","journal-title":"Neurocomputing"},{"key":"18_CR26","unstructured":"Sun, Z., et al.: Alpha-clip: A CLIP model focusing on wherever you want. CoRR abs\/2312.03818 (2023)"},{"key":"18_CR27","doi-asserted-by":"crossref","unstructured":"Tang, Y., Yamada, Y., Zhang, Y., Yildirim, I.: When are lemons purple? the concept association bias of vision-language models. In: EMNLP, pp. 14333\u201314348. Association for Computational Linguistics (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.886"},{"key":"18_CR28","doi-asserted-by":"crossref","unstructured":"Vinker, Y., et al.: Clipasso: semantically-aware object sketching. ACM Trans. Graph. 41(4), 86:1\u201386:11 (2022)","DOI":"10.1145\/3528223.3530068"},{"key":"18_CR29","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: Videoclip: contrastive pre-training for zero-shot video-text understanding. In: Moens, M., Huang, X., Specia, L., Yih, S.W. (eds.) EMNLP, pp. 6787\u20136800. Association for Computational Linguistics (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"18_CR30","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: Groupvit: semantic segmentation emerges from text supervision. In: CVPR, pp. 18113\u201318123. IEEE (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"18_CR31","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguistics 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguistics"},{"key":"18_CR32","unstructured":"Zeng, Y., Zhang, X., Li, H.: Multi-grained vision language pre-training: Aligning texts with visual concepts. arXiv preprint arXiv:2111.08276 (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72983-6_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T10:36:26Z","timestamp":1732962986000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72983-6_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,29]]},"ISBN":["9783031729829","9783031729836"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72983-6_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,29]]},"assertion":[{"value":"29 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}