{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T12:21:55Z","timestamp":1767183715992,"version":"3.41.0"},"publisher-location":"Cham","reference-count":66,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031938054","type":"print"},{"value":"9783031938061","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-93806-1_19","type":"book-chapter","created":{"date-parts":[[2025,5,31]],"date-time":"2025-05-31T18:29:38Z","timestamp":1748716178000},"page":"250-267","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Towards Multimodal In-context Learning for\u00a0Vision and\u00a0Language Models"],"prefix":"10.1007","author":[{"given":"Sivan","family":"Doveh","sequence":"first","affiliation":[]},{"given":"Shaked","family":"Perek","sequence":"additional","affiliation":[]},{"given":"M. Jehanzeb","family":"Mirza","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Amit","family":"Alfassy","sequence":"additional","affiliation":[]},{"given":"Assaf","family":"Arbelle","sequence":"additional","affiliation":[]},{"given":"Shimon","family":"Ullman","sequence":"additional","affiliation":[]},{"given":"Leonid","family":"Karlinsky","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,20]]},"reference":[{"key":"19_CR1","unstructured":"Alayrac, J.B., et al.: Flamingo: a Visual Language Model for Few-Shot Learning. arXiv:2204.14198 (2022)"},{"key":"19_CR2","unstructured":"Alfassy, A., et al.: Feta: towards specializing foundation models for expert task applications (2022)"},{"key":"19_CR3","unstructured":"Awadalla, A., et al.: Openflamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)"},{"key":"19_CR4","unstructured":"Bai, J., et al.: Qwen-vl: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Bossard, L., Guillaumin, M., Van\u00a0Gool, L.: Food-101 \u2013 mining discriminative components with random forests. In: European Conference on Computer Vision (2014)","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"19_CR6","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR7","unstructured":"Brown, T.B., et al.: Language Models are Few-Shot Learners. 
arXiv:2005.14165 (2020)"},{"key":"19_CR8","unstructured":"Chen, J., Li, D.Z.X.S.X., Zhang, Z.L.P., Xiong, R.K.V.C.Y., Elhoseiny, M.: MINIGPT-V2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"19_CR9","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"19_CR10","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing gpt-4 with 90%* chatgpt quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"19_CR11","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: Thirty-seventh Conference on Neural Information Processing Systems (2023)"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 326\u2013335 (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Doveh, et\u00a0al.: Teaching structured vision & language concepts to vision & language models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00261"},{"key":"19_CR14","unstructured":"Doveh, S., et\u00a0al.: Dense and aligned captions (DAC) promote compositional reasoning in VL models. arXiv preprint arXiv:2305.19595 (2023)"},{"key":"19_CR15","unstructured":"Fu, C., et\u00a0al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"19_CR16","unstructured":"Goel, S., Bansal, H., Bhatia, S., Rossi, R.A., Vinay, V., Grover, A.: CyCLIP: cyclic contrastive language-image pretraining. arXiv:2205.14459 (2022)"},{"key":"19_CR17","unstructured":"Gong, Y., Luo, H., Liu, A.H., Karlinsky, L., Glass, J.: Listen, think, and understand. arXiv preprint arXiv:2305.10790 (2023)"},{"key":"19_CR18","unstructured":"Hendrycks, D., et al.: Measuring massive multitask language understanding. In: Proceedings of the International Conference on Learning Representations (ICLR) (2021)"},{"key":"19_CR19","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Proceedings of the ICML (2021)"},{"key":"19_CR20","unstructured":"Jiang, A.Q., et\u00a0al.: Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S.: Maple: multi-modal Prompt Learning. In: IEEE Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"19_CR22","unstructured":"Khosla, A., Jayadevaprakash, N., Yao, B., Fei-Fei, L.: Novel dataset for fine-grained image categorization. In: First Workshop on Fine-Grained Visual Categorization, IEEE Conference on Computer Vision and Pattern Recognition, Colorado Springs, CO (2011)"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3D object representations for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp. 
554\u2013561 (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"key":"19_CR24","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Lampinen, A.K., et al.: Can language models learn from explanations in context? arXiv preprint arXiv:2204.02329 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.38"},{"key":"19_CR26","unstructured":"Lauren\u00e7on, H., et al.: Obelics: an open web-scale filtered dataset of interleaved image-text documents (2023)"},{"key":"19_CR27","first-page":"3843","volume":"35","author":"A Lewkowycz","year":"2022","unstructured":"Lewkowycz, A., et al.: Solving quantitative reasoning problems with language models. Adv. Neural. Inf. Process. Syst. 35, 3843\u20133857 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR28","unstructured":"Li, B., et\u00a0al.: MIMIC-it: multi-modal in-context instruction tuning. arXiv (2023)"},{"key":"19_CR29","unstructured":"Li, B., et al.: Seed-bench-2: benchmarking multimodal large language models (2023)"},{"key":"19_CR30","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., Shan, Y.: Seed-bench: benchmarking multimodal LLMs with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"19_CR31","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"19_CR32","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. arXiv:2201.12086 (2022)"},{"key":"19_CR33","unstructured":"Li, Y., et al.: Supervision Exists everywhere: a data efficient contrastive language-image pre-training paradigm. arXiv:2110.05208 (2021)"},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Li, Y.L., et al.: Hake: a knowledge engine foundation for human activity understanding. TPAMI (2023)","DOI":"10.1109\/TPAMI.2022.3232797"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"Lin, W., et al.: Match, expand and improve: Unsupervised finetuning for zero-shot action recognition with language knowledge. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00267"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"19_CR37","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"19_CR38","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Llava-next (llava 1.6) (2023). https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"19_CR39","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"19_CR40","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. 
arXiv preprint arXiv:2304.08485 (2023)"},{"key":"19_CR41","unstructured":"Lyu, C., et al.: Macaw-LLM: multi-modal language modeling with image, audio, video, and text integration. arXiv preprint arXiv:2306.09093 (2023)"},{"key":"19_CR42","doi-asserted-by":"crossref","unstructured":"Min, S., Lewis, M., Zettlemoyer, L., Hajishirzi, H.: Metaicl: learning to learn in context. arXiv preprint arXiv:2110.15943 (2021)","DOI":"10.18653\/v1\/2022.naacl-main.201"},{"key":"19_CR43","unstructured":"Mirza, M.J., Karlinsky, L., Lin, W., Possegger, H., Feris, R., Bischof, H.: TAP: targeted prompting for task adaptive generation of textual training instances for visual classification. arXiv preprint arXiv:2309.06809 (2023)"},{"key":"19_CR44","unstructured":"Mirza, M.J., et al.: LaFTer: label-free tuning of zero-shot classifier using language and unlabeled image collections. In: NeurIPS (2023)"},{"key":"19_CR45","doi-asserted-by":"publisher","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing, pp. 722\u2013729 (2008). https:\/\/doi.org\/10.1109\/ICVGIP.2008.47","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"19_CR46","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"19_CR47","doi-asserted-by":"crossref","unstructured":"Pham, K., et al.: Learning to predict visual attributes in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13018\u201313028 (2021)","DOI":"10.1109\/CVPR46437.2021.01282"},{"key":"19_CR48","doi-asserted-by":"crossref","unstructured":"Pratt, S., Yatskar, M., Weihs, L., Farhadi, A., Kembhavi, A.: Grounded situation recognition. arXiv: abs\/2003.12058 (2020)","DOI":"10.1007\/978-3-030-58548-8_19"},{"key":"19_CR49","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: Proceeding of the ICML (2021)"},{"key":"19_CR50","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"19_CR51","unstructured":"Schuhmann, C., et al.: LAION-5b: an open large-scale dataset for training next generation image-text models. In: NeurIPS (2022)"},{"key":"19_CR52","unstructured":"Sun, Q., et\u00a0al.: Generative multimodal models are in-context learners. arXiv preprint arXiv:2312.13286 (2023)"},{"key":"19_CR53","unstructured":"Touvron, H., et\u00a0al.: LLAMA: open and efficient foundation language models. arXiv:2302.13971 (2023)"},{"key":"19_CR54","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"19_CR55","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: Cub-200-2011 (caltech-ucsd birds-200-2011). Technical report CNS-TR-2011-001, California Institute of Technology (2011)"},{"key":"19_CR56","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: The caltech-ucsd birds-200-2011 dataset. Technical report, California Institute of Technology (2011)"},{"key":"19_CR57","unstructured":"Wang, D., et al.: DocLLM: a layout-aware generative language model for multimodal document understanding. 
arXiv preprint arXiv:2401.00908 (2023)"},{"key":"19_CR58","unstructured":"Wang, W., et\u00a0al.: VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. arXiv preprint arXiv:2305.11175 (2023)"},{"key":"19_CR59","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR60","unstructured":"Xu, H., et al.: Demystifying clip data. In: Proceedings of the ICLR (2023)"},{"key":"19_CR61","unstructured":"Yao, S., et al.: Tree of thoughts: deliberate problem solving with large language models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"19_CR62","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., Beyer, L.: Sigmoid loss for language image pre-training. arXiv preprint arXiv:2303.15343 (2023)","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"19_CR63","unstructured":"Zhao, T., et al.: Vl-checklist: evaluating pre-trained vision-language models with objects, attributes and relations. arXiv preprint arXiv:2207.00221 (2022)"},{"key":"19_CR64","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"19_CR65","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. IJCV (2022)","DOI":"10.1007\/s11263-022-01653-1"},{"key":"19_CR66","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. 
arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-93806-1_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,31]],"date-time":"2025-05-31T18:29:51Z","timestamp":1748716191000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-93806-1_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031938054","9783031938061"],"references-count":66,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-93806-1_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"20 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
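The record above is a standard Crossref work object: the payload sits under "message", with the chapter's title, authors, DOI, and the deposited reference list as nested fields. As a minimal sketch of how such a record can be retrieved and read, the Python below fetches this chapter's metadata from the public Crossref REST API ("https://api.crossref.org/works/{doi}" is the real endpoint) and prints a few of the fields shown; the User-Agent string and the printed summary format are illustrative choices, not part of the record.

```python
import json
import urllib.request

# DOI of the chapter whose record is shown above.
DOI = "10.1007/978-3-031-93806-1_19"

# Crossref serves work metadata at /works/{doi}; a descriptive
# User-Agent with a contact address follows Crossref's etiquette
# guidelines (the address here is a placeholder).
url = f"https://api.crossref.org/works/{DOI}"
req = urllib.request.Request(
    url, headers={"User-Agent": "example-client/0.1 (mailto:you@example.org)"}
)

with urllib.request.urlopen(req) as resp:
    record = json.load(resp)

# The work itself lives under the "message" key, as in the record above.
msg = record["message"]

title = msg["title"][0]  # "title" is a list, usually with one entry
authors = [
    f'{a.get("given", "")} {a.get("family", "")}'.strip()
    for a in msg.get("author", [])
]

print(title)
print(", ".join(authors))
print(f'{msg.get("reference-count", 0)} references, DOI: {msg["DOI"]}')
```

Run against this DOI, the script would print the chapter title, the eight authors from Doveh through Karlinsky, and the reference count of 66, matching the fields in the deposited record.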