{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T00:43:09Z","timestamp":1774312989976,"version":"3.50.1"},"publisher-location":"Cham","reference-count":120,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733963","type":"print"},{"value":"9783031733970","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73397-0_18","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:08:44Z","timestamp":1730574524000},"page":"304-323","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":59,"title":["MM1: Methods, Analysis and\u00a0Insights from\u00a0Multimodal LLM Pre-training"],"prefix":"10.1007","author":[{"given":"Brandon","family":"McKinzie","sequence":"first","affiliation":[]},{"given":"Zhe","family":"Gan","sequence":"additional","affiliation":[]},{"given":"Jean-Philippe","family":"Fauconnier","sequence":"additional","affiliation":[]},{"given":"Sam","family":"Dodge","sequence":"additional","affiliation":[]},{"given":"Bowen","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Philipp","family":"Dufter","sequence":"additional","affiliation":[]},{"given":"Dhruti","family":"Shah","sequence":"additional","affiliation":[]},{"given":"Xianzhi","family":"Du","sequence":"additional","affiliation":[]},{"given":"Futang","family":"Peng","sequence":"additional","affiliation":[]},{"given":"Anton","family":"Belyi","sequence":"additional","affiliation":[]},{"given":"Haotian","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Karanjeet","family":"Singh","sequence":"additional","affiliation":[]},{"given":"Doug","family":"Kang","sequence":"additional","affiliation":[]},{"given":"Hongyu","family":"H\u00e8","sequence":"additional","affiliation":[]},{"given":"Max","family":"Schwarzer","sequence":"additional","affiliation":[]},{"given":"Tom","family":"Gunter","sequence":"additional","affiliation":[]},{"given":"Xiang","family":"Kong","sequence":"additional","affiliation":[]},{"given":"Aonan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jianyu","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Chong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Nan","family":"Du","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Lei","sequence":"additional","affiliation":[]},{"given":"Sam","family":"Wiseman","sequence":"additional","affiliation":[]},{"given":"Mark","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Zirui","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Ruoming","family":"Pang","sequence":"additional","affiliation":[]},{"given":"Peter","family":"Grasch","sequence":"additional","affiliation":[]},{"given":"Alexander","family":"Toshev","sequence":"additional","affiliation":[]},{"given":"
Yinfei","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"18_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"18_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: Nocaps: novel object captioning at scale. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"key":"18_CR3","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning (2022)"},{"key":"18_CR4","unstructured":"Awadalla, A., et al.: Openflamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)"},{"key":"18_CR5","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"18_CR6","unstructured":"Bommasani, R., et\u00a0al.: On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)"},{"key":"18_CR7","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: NeurIPS (2020)"},{"key":"18_CR8","unstructured":"Byeon, M., Park, B., Kim, H., Lee, S., Baek, W., Kim, S.: COYO-700M: image-text pair dataset (2022). https:\/\/github.com\/kakaobrain\/coyo-dataset"},{"key":"18_CR9","doi-asserted-by":"crossref","unstructured":"Cha, J., Kang, W., Mun, J., Roh, B.: Honeybee: locality-enhanced projector for multimodal LLM. arXiv preprint arXiv:2312.06742 (2023)","DOI":"10.1109\/CVPR52733.2024.01311"},{"key":"18_CR10","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"18_CR11","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"18_CR12","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: ShareGPT4V: improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"18_CR13","doi-asserted-by":"crossref","unstructured":"Chen, T., et al.: AdaMV-MoE: adaptive multi-task vision mixture-of-experts. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01591"},{"key":"18_CR14","unstructured":"Chen, X., et\u00a0al.: PaLI-X: on scaling up a multilingual vision and language model. arXiv preprint arXiv:2305.18565 (2023)"},{"key":"18_CR15","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"18_CR16","unstructured":"Chowdhery, A., et\u00a0al.: PaLM: scaling language modeling with pathways. JMLR 24(240), 1\u2013113 (2023)"},{"key":"18_CR17","unstructured":"Chu, X., et\u00a0al.: MobileVLM: a fast, reproducible and strong vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886 (2023)"},{"key":"18_CR18","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"18_CR19","doi-asserted-by":"crossref","unstructured":"Dai, D., et al.: DeepSeekMoE: towards ultimate expert specialization in mixture-of-experts language models. 
arXiv preprint arXiv:2401.06066 (2024)","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"18_CR20","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"18_CR21","unstructured":"Daxberger, E., et al.: Mobile V-MoEs: scaling down vision transformers via sparse mixture-of-experts (2023)"},{"key":"18_CR22","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"18_CR23","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16 x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"18_CR24","unstructured":"Driess, D., et\u00a0al.: PaLM-E: an embodied multimodal language model. arXiv preprint arXiv:2303.03378 (2023)"},{"key":"18_CR25","unstructured":"Du, N., et al.: GLaM: efficient scaling of language models with mixture-of-experts. In: ICML (2022)"},{"key":"18_CR26","unstructured":"El-Nouby, A., et al.: Scalable pre-training of large autoregressive image models. arXiv preprint arXiv:2401.08541 (2024)"},{"key":"18_CR27","unstructured":"Fang, A., Jose, A.M., Jain, A., Schmidt, L., Toshev, A., Shankar, V.: Data filtering networks. arXiv preprint arXiv:2309.17425 (2023)"},{"key":"18_CR28","unstructured":"Fedus, W., Zoph, B., Shazeer, N.: Switch transformers: scaling to trillion parameter models with simple and efficient sparsity (2022)"},{"key":"18_CR29","unstructured":"Fu, C., et\u00a0al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"18_CR30","unstructured":"Fu, T.J., et al.: Guiding instruction-based image editing via multimodal large language models. arXiv preprint arXiv:2309.17102 (2023)"},{"key":"18_CR31","unstructured":"Gao, P., et\u00a0al.: SPHINX-X: scaling data and parameters for a family of multi-modal large language models. arXiv preprint arXiv:2402.05935 (2024)"},{"key":"18_CR32","unstructured":"Gong, T., et al.: Multimodal-GPT: a vision and language model for dialogue with humans. arXiv preprint arXiv:2305.04790 (2023)"},{"key":"18_CR33","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"18_CR34","doi-asserted-by":"crossref","unstructured":"Gurari, D., et al.: VizWiz grand challenge: answering visual questions from blind people. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00380"},{"key":"18_CR35","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"18_CR36","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"18_CR37","unstructured":"He, M., et al.: Efficient multimodal learning from data-centric perspective. arXiv preprint arXiv:2402.11530 (2024)"},{"key":"18_CR38","unstructured":"Henighan, T., et\u00a0al.: Scaling laws for autoregressive generative modeling. 
arXiv preprint arXiv:2010.14701 (2020)"},{"key":"18_CR39","unstructured":"Hoffmann, J., et al.: Training compute-optimal large language models (2022)"},{"key":"18_CR40","unstructured":"Huang, S., et al.: Language is not all you need: aligning perception with language models (2023)"},{"key":"18_CR41","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"18_CR42","unstructured":"IDEFICS: Introducing IDEFICS: an open reproduction of state-of-the-art visual language model (2023). https:\/\/huggingface.co\/blog\/idefics"},{"key":"18_CR43","unstructured":"Isik, B., Ponomareva, N., Hazimeh, H., Paparas, D., Vassilvitskii, S., Koyejo, S.: Scaling laws for downstream task performance of large language models (2024)"},{"key":"18_CR44","unstructured":"Jiang, A.Q., et al.: Mixtral of experts (2024)"},{"key":"18_CR45","doi-asserted-by":"crossref","unstructured":"Kafle, K., Price, B., Cohen, S., Kanan, C.: DVQA: understanding data visualizations via question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00592"},{"key":"18_CR46","doi-asserted-by":"crossref","unstructured":"Kembhavi, A., Salvato, M., Kolve, E., Seo, M., Hajishirzi, H., Farhadi, A.: A diagram is worth a dozen images. In: ECCV (2016)","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"18_CR47","doi-asserted-by":"crossref","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"18_CR48","unstructured":"Koh, J.Y., Fried, D., Salakhutdinov, R.: Generating images with multimodal language models. arXiv preprint arXiv:2305.17216 (2023)"},{"key":"18_CR49","unstructured":"Komatsuzaki, A., et al.: Sparse upcycling: training mixture-of-experts from dense checkpoints. In: ICLR (2023)"},{"key":"18_CR50","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: LISA: reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692 (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"18_CR51","unstructured":"Lai, Z., et\u00a0al.: From scarcity to efficiency: improving clip training via visual-enriched captions. arXiv preprint arXiv:2310.07699 (2023)"},{"key":"18_CR52","unstructured":"Lauren\u00e7on, H., et al.: OBELICS: an open web-scale filtered dataset of interleaved image-text documents (2023)"},{"key":"18_CR53","unstructured":"Lepikhin, D., et al.: GShard: scaling giant models with conditional computation and automatic sharding. In: ICLR (2021)"},{"key":"18_CR54","unstructured":"Li, B., et al.: MIMIC-IT: multi-modal in-context instruction tuning. arXiv preprint arXiv:2306.05425 (2023)"},{"key":"18_CR55","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: a multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"18_CR56","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., Shan, Y.: Seed-bench: benchmarking multimodal LLMs with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"18_CR57","doi-asserted-by":"crossref","unstructured":"Li, C., et al.: Multimodal foundation models: from specialists to general-purpose assistants. 
arXiv preprint arXiv:2309.10020 (2023)","DOI":"10.1561\/9781638283379"},{"key":"18_CR58","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"18_CR59","unstructured":"Li, L., et\u00a0al.: $$M^3$$it: a large-scale dataset towards multi-modal multilingual instruction tuning. arXiv preprint arXiv:2306.04387 (2023)"},{"key":"18_CR60","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"18_CR61","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"18_CR62","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Monkey: image resolution and text label are important things for large multi-modal models. arXiv preprint arXiv:2311.06607 (2023)","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"18_CR63","unstructured":"Lin, B., et al.: MoE-LLaVA: mixture of experts for large vision-language models (2024)"},{"key":"18_CR64","unstructured":"Lin, J., et al.: VILA: on pre-training for visual language models. arXiv preprint arXiv:2312.07533 (2023)"},{"key":"18_CR65","doi-asserted-by":"crossref","unstructured":"Lin, T., et al.: Microsoft COCO: common objects in context. arXiv preprint arXiv:1405.0312 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"18_CR66","doi-asserted-by":"crossref","unstructured":"Lin, Z., et\u00a0al.: SPHINX: the joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv preprint arXiv:2311.07575 (2023)","DOI":"10.1007\/978-3-031-73033-7_3"},{"key":"18_CR67","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"18_CR68","unstructured":"Liu, H., et al.: LLAVA-NeXT: improved reasoning, OCR, and world knowledge (2024). https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"18_CR69","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning (2023)"},{"key":"18_CR70","unstructured":"Liu, S., et\u00a0al.: LLAVA-Plus: learning to use tools for creating multimodal agents. arXiv preprint arXiv:2311.05437 (2023)"},{"key":"18_CR71","doi-asserted-by":"crossref","unstructured":"Liu, Y., et\u00a0al.: MMBench: is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"18_CR72","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: NeurIPS (2019)"},{"key":"18_CR73","unstructured":"Lu, P., et al.: MathVista: evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255 (2023)"},{"key":"18_CR74","unstructured":"Lu, P., et al.: Learn to explain: multimodal reasoning via thought chains for science question answering. In: NeurIPS (2022)"},{"key":"18_CR75","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: OK-VQA: a visual question answering benchmark requiring external knowledge. 
In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"18_CR76","doi-asserted-by":"crossref","unstructured":"Masry, A., Long, D.X., Tan, J.Q., Joty, S., Hoque, E.: ChartQA: a benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"18_CR77","doi-asserted-by":"crossref","unstructured":"Mathew, M., Bagal, V., Tito, R., Karatzas, D., Valveny, E., Jawahar, C.: InfographicVQA. In: WACV (2022)","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"18_CR78","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: DocVQA: a dataset for VQA on document images. In: WACV (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"18_CR79","doi-asserted-by":"crossref","unstructured":"Mishra, A., Shekhar, S., Singh, A.K., Chakraborty, A.: OCR-VQA: visual question answering by reading text in images. In: ICDAR (2019)","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"18_CR80","unstructured":"Mustafa, B., Ruiz, C.R., Puigcerver, J., Jenatton, R., Houlsby, N.: Multimodal contrastive learning with LIMoE: the language-image mixture of experts. In: NeurIPS (2022)"},{"key":"18_CR81","unstructured":"Oquab, M., et\u00a0al.: DINOv2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"18_CR82","unstructured":"Peng, Z., et al.: KOSMOS-2: grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"18_CR83","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"18_CR84","unstructured":"Rae, J.W., et\u00a0al.: Scaling language models: methods, analysis and insights from training gopher. arXiv preprint arXiv:2112.11446 (2021)"},{"key":"18_CR85","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR 21(140), 1\u201367 (2020)"},{"key":"18_CR86","doi-asserted-by":"crossref","unstructured":"Ranasinghe, K., McKinzie, B., Ravi, S., Yang, Y., Toshev, A., Shlens, J.: Perceptual grouping in contrastive vision-language models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00513"},{"key":"18_CR87","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: DenseCLIP: language-guided dense prediction with context-aware prompting. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"18_CR88","unstructured":"Ruiz, C.R., et al.: Scaling vision with sparse mixture of experts. In: Beygelzimer, A., Dauphin, Y., Liang, P., Vaughan, J.W. (eds.) NeurIPS (2021)"},{"key":"18_CR89","doi-asserted-by":"crossref","unstructured":"Schwenk, D., Khandelwal, A., Clark, C., Marino, K., Mottaghi, R.: A-OKVQA: a benchmark for visual question answering using world knowledge. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"18_CR90","unstructured":"Shao, Z., Ouyang, X., Yu, Z., Yu, J.: Imp: an empirical study of multimodal small language models (2024). https:\/\/huggingface.co\/MILVLG\/imp-v1-3b"},{"key":"18_CR91","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"18_CR92","unstructured":"Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., Catanzaro, B.: Megatron-LM: training multi-billion parameter language models using model parallelism. 
arXiv preprint arXiv:1909.08053 (2019)"},{"key":"18_CR93","doi-asserted-by":"crossref","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., Singh, A.: TextCaps: a dataset for image captioning with reading comprehension. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"18_CR94","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"18_CR95","unstructured":"Sun, Q., et\u00a0al.: Generative multimodal models are in-context learners. arXiv preprint arXiv:2312.13286 (2023)"},{"key":"18_CR96","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"18_CR97","unstructured":"Thoppilan, R., et\u00a0al.: LaMDA: language models for dialog applications. arXiv preprint arXiv:2201.08239 (2022)"},{"key":"18_CR98","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs. arXiv preprint arXiv:2401.06209 (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"18_CR99","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"18_CR100","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. In: NeurIPS (2021)"},{"key":"18_CR101","doi-asserted-by":"crossref","unstructured":"Wang, F., Mei, J., Yuille, A.: SCLIP: rethinking self-attention for dense vision-language inference. arXiv preprint arXiv:2312.01597 (2023)","DOI":"10.1007\/978-3-031-72664-4_18"},{"key":"18_CR102","unstructured":"Wang, J., Meng, L., Weng, Z., He, B., Wu, Z., Jiang, Y.G.: To see is to believe: prompting GPT-4V for better visual instruction tuning. arXiv preprint arXiv:2311.07574 (2023)"},{"key":"18_CR103","unstructured":"Wang, W., et\u00a0al.: CogVLM: visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)"},{"key":"18_CR104","unstructured":"Wang, W., et\u00a0al.: VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. arXiv preprint arXiv:2305.11175 (2023)"},{"key":"18_CR105","unstructured":"Wei, J., et al.: Finetuned language models are zero-shot learners. arXiv preprint arXiv:2109.01652 (2021)"},{"key":"18_CR106","unstructured":"Yang, G., Hu, E.J.: Feature learning in infinite-width neural networks. arXiv preprint arXiv:2011.14522 (2020)"},{"key":"18_CR107","unstructured":"Yang, G., et al.: Tensor programs V: tuning large neural networks via zero-shot hyperparameter transfer (2022)"},{"key":"18_CR108","unstructured":"Ye, J., et\u00a0al.: mPLUG-DocOwl: modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499 (2023)"},{"key":"18_CR109","unstructured":"Ye, Q., et\u00a0al.: mPLUG-Owl: modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"18_CR110","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: mPLUG-Owl2: revolutionizing multi-modal large language model with modality collaboration. arXiv preprint arXiv:2311.04257 (2023)","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"18_CR111","unstructured":"You, H., et al.: FERRET: refer and ground anything anywhere at any granularity. 
In: ICLR (2024)"},{"key":"18_CR112","unstructured":"Yu, W., et al.: MM-Vet: evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)"},{"key":"18_CR113","doi-asserted-by":"crossref","unstructured":"Yue, X., et\u00a0al.: MMMU: a massive multi-discipline multimodal understanding and reasoning benchmark for expert AGI. arXiv preprint arXiv:2311.16502 (2023)","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"18_CR114","doi-asserted-by":"crossref","unstructured":"Zhang, H., et\u00a0al.: LLaVa-grounding: grounded visual chat with large multimodal models. arXiv preprint arXiv:2312.02949 (2023)","DOI":"10.1007\/978-3-031-72775-7_2"},{"key":"18_CR115","unstructured":"Zhang, S., et\u00a0al.: OPT: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"18_CR116","unstructured":"Zhao, B., Wu, B., Huang, T.: SViT: scaling up visual instruction tuning. arXiv preprint arXiv:2307.04087 (2023)"},{"key":"18_CR117","unstructured":"Zhou, B., et al.: TinyLLAVA: a framework of small-scale large multimodal models. arXiv preprint arXiv:2402.14289 (2024)"},{"key":"18_CR118","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"18_CR119","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Zhu, M., Liu, N., Ou, Z., Mou, X., Tang, J.: LLaVa-Phi: efficient multi-modal assistant with small language model. arXiv preprint arXiv:2401.02330 (2024)","DOI":"10.1145\/3688863.3689575"},{"key":"18_CR120","unstructured":"Zoph, B., et al.: ST-MoE: designing stable and transferable sparse expert models (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73397-0_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T18:23:54Z","timestamp":1732991034000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73397-0_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031733963","9783031733970"],"references-count":120,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73397-0_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}