{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T04:31:03Z","timestamp":1774326663957,"version":"3.50.1"},"publisher-location":"Cham","reference-count":112,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730320","type":"print"},{"value":"9783031730337","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73033-7_3","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:03:55Z","timestamp":1730333035000},"page":"36-55","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":22,"title":["SPHINX: A Mixer of\u00a0Weights, Visual Embeddings and\u00a0Image Scales for\u00a0Multi-modal Large Language Models"],"prefix":"10.1007","author":[{"given":"Ziyi","family":"Lin","sequence":"first","affiliation":[]},{"given":"Dongyang","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Renrui","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Longtian","family":"Qiu","sequence":"additional","affiliation":[]},{"given":"Han","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Han","family":"Qiu","sequence":"additional","affiliation":[]},{"given":"Wenqi","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Keqin","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Jiaming","family":"Han","sequence":"additional","affiliation":[]},{"given":"Siyuan","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Yichi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xuming","family":"He","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"3_CR1","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1007\/s11263-016-0966-6","volume":"123","author":"A Agrawal","year":"2015","unstructured":"Agrawal, A., et al.: VQA: visual question answering. Int. J. Comput. Vision 123, 4\u201331 (2015)","journal-title":"Int. J. Comput. Vision"},{"key":"3_CR2","unstructured":"Aiello, E., Yu, L., Nie, Y., Aghajanyan, A., Oguz, B.: Jointly training large autoregressive multimodal models. arXiv preprint arXiv:2309.15564 (2023)"},{"key":"3_CR3","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems, vol. 35, pp. 23716\u201323736 (2022)"},{"key":"3_CR4","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv arXiv:2308.12966 (2023)"},{"key":"3_CR5","unstructured":"Bai, S., et al.: Touchstone: evaluating vision-language models by language models. arXiv preprint arXiv:2308.16890 (2023)"},{"key":"3_CR6","unstructured":"Bavishi, R., et al.: Introducing our multimodal models (2023). https:\/\/www.adept.ai\/blog\/fuyu-8b"},{"key":"3_CR7","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol. 33, pp. 1877\u20131901 (2020)"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Chen, J., Guo, H., Yi, K., Li, B., Elhoseiny, M.: VisualGPT: data-efficient adaptation of pretrained language models for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18030\u201318040 (2022)","DOI":"10.1109\/CVPR52688.2022.01750"},{"key":"3_CR9","unstructured":"Chen, J., Li, D.Z.X.S.X., Zhang, Z.L.P., Xiong, R.K.V.C.Y., Elhoseiny, M.: MiniGPT-V2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"3_CR10","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"3_CR11","unstructured":"Chen, X., et\u00a0al.: Pali: a jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794 (2022)"},{"key":"3_CR12","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"3_CR13","unstructured":"Contributors, O.: OpenCompass: a universal evaluation platform for foundation models (2023). https:\/\/github.com\/open-compass\/opencompass"},{"key":"3_CR14","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. arXiv arXiv:2305.06500 (2023)"},{"key":"3_CR15","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"3_CR16","unstructured":"Dong, R., et\u00a0al.: DreamLLM: synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499 (2023)"},{"key":"3_CR17","unstructured":"Dong, X., et al.: CLIP itself is a strong fine-tuner: achieving 85.7% and 88.0% top-1 accuracy with ViT-B and ViT-L on ImageNet. arXiv preprint arXiv:2212.06138 (2022)"},{"key":"3_CR18","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\,\\times \\,$$16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"3_CR19","unstructured":"Douillard, A., et al.: Diloco: distributed low-communication training of language models. arXiv preprint arXiv:2311.08105 (2023)"},{"key":"3_CR20","unstructured":"Fu, C., et\u00a0al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"3_CR21","unstructured":"Gao, P., et\u00a0al.: LLaMA-Adapter V2: parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)"},{"key":"3_CR22","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: ImageBind one embedding space to bind them all. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15180\u201315190 (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"3_CR23","unstructured":"Google: Bard (2023). https:\/\/bard.google.com\/"},{"key":"3_CR24","unstructured":"Guo, Z., et al.: Point-bind & point-LLM: aligning point cloud with multi-modality for 3D understanding, generation, and instruction following. arXiv arXiv:2309.00615 (2023)"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Gurari, D., Li, Q., Stangl, A., Guo, A., Lin, C., Grauman, K., Luo, J., Bigham, J.P.: Vizwiz grand challenge: answering visual questions from blind people. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3608\u20133617 (2018)","DOI":"10.1109\/CVPR.2018.00380"},{"key":"3_CR27","unstructured":"Han, J., et\u00a0al.: ImageBind-LLM: multi-modality instruction tuning. arXiv preprint arXiv:2309.03905 (2023)"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3_CR29","unstructured":"Hong, Y., et al.: 3D-LLM: injecting the 3D world into large language models. arXiv preprint arXiv:2307.12981 (2023)"},{"key":"3_CR30","unstructured":"Huang, C., Liu, Q., Lin, B.Y., Pang, T., Du, C., Lin, M.: LoraHub: efficient cross-task generalization via dynamic LoRA composition. arXiv preprint arXiv:2307.13269 (2023)"},{"key":"3_CR31","unstructured":"Huang, S., et\u00a0al.: Language is not all you need: aligning perception with language models. arXiv preprint arXiv:2302.14045 (2023)"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6693\u20136702 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"3_CR33","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., andre Matten, M., Berg, T.L.: ReferitGame: referring to objects in photographs of natural scenes. In: Conference on Empirical Methods in Natural Language Processing (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"3_CR34","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"3_CR35","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"3_CR36","unstructured":"Li, B., et al.: MIMIC-IT: multi-modal in-context instruction tuning. arXiv arXiv:2306.05425 (2023)"},{"key":"3_CR37","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: a multi-modal model with in-context instruction tuning. arXiv arXiv:2305.03726 (2023)"},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., Shan, Y.: SEED-bench: benchmarking multimodal LLMs with generative comprehension. arXiv arXiv:2307.16125 (2023)","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"3_CR39","unstructured":"Li, H., Xu, Z., Taylor, G., Studer, C., Goldstein, T.: Visualizing the loss landscape of neural nets. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"3_CR40","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"3_CR41","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"3_CR42","unstructured":"Li, M., et al.: Branch-train-merge: embarrassingly parallel training of expert language models. arXiv preprint arXiv:2208.03306 (2022)"},{"key":"3_CR43","doi-asserted-by":"publisher","unstructured":"Li, Y., Mao, H., Girshick, R., He, K.: Exploring plain vision transformer backbones for object detection. In: European Conference on Computer Vision, pp. 280\u2013296. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_17","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"3_CR44","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"3_CR45","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"3_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"3_CR47","doi-asserted-by":"publisher","first-page":"635","DOI":"10.1162\/tacl_a_00566","volume":"11","author":"F Liu","year":"2023","unstructured":"Liu, F., Emerson, G.E.T., Collier, N.: Visual spatial reasoning. Trans. Assoc. Comput. Linguist. 11, 635\u2013651 (2023)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"3_CR48","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"3_CR49","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"3_CR50","doi-asserted-by":"crossref","unstructured":"Liu, S., et al.: Grounding DINO: marrying DINO with grounded pre-training for open-set object detection. arXiv arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"3_CR51","doi-asserted-by":"crossref","unstructured":"Liu, Y., et\u00a0al.: MMBench: is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"3_CR52","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"3_CR53","unstructured":"Lu, P., et al.: MathVista: evaluating math reasoning in visual contexts with GPT-4V, bard, and other large multimodal models. arXiv arXiv:2310.02255 (2023)"},{"key":"3_CR54","unstructured":"Lu, P., et al.: Learn to explain: multimodal reasoning via thought chains for science question answering. In: The 36th Conference on Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"3_CR55","unstructured":"Lu, P., et al.: IconQA: a new benchmark for abstract diagram understanding and visual language reasoning. In: The 35th Conference on Neural Information Processing Systems (NeurIPS) Track on Datasets and Benchmarks (2021)"},{"key":"3_CR56","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O.M., Yuille, A.L., Murphy, K.P.: Generation and comprehension of unambiguous object descriptions. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11\u201320 (2015)","DOI":"10.1109\/CVPR.2016.9"},{"key":"3_CR57","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: OK-VQA: a visual question answering benchmark requiring external knowledge. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3190\u20133199 (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"3_CR58","unstructured":"McMahan, B., Moore, E., Ramage, D., Hampson, S., y\u00a0Arcas, B.A.: Communication-efficient learning of deep networks from decentralized data. In: Artificial Intelligence and Statistics, pp. 1273\u20131282. PMLR (2017)"},{"key":"3_CR59","doi-asserted-by":"crossref","unstructured":"Mishra, A., Shekhar, S., Singh, A.K., Chakraborty, A.: OCR-VQA: visual question answering by reading text in images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 947\u2013952 (2019)","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"3_CR60","unstructured":"OpenAI: Chatgpt (2023). https:\/\/chat.openai.com"},{"key":"3_CR61","unstructured":"OpenAI: GPT-4 technical report. arXiv arXiv:2303.08774 (2023)"},{"key":"3_CR62","unstructured":"OpenAI: Vision - OpenAI api (2023). https:\/\/platform.openai.com\/docs\/guides\/vision"},{"key":"3_CR63","unstructured":"Oquab, M., et\u00a0al.: DINOv2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"3_CR64","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems, vol. 35, pp. 27730\u201327744 (2022)"},{"key":"3_CR65","unstructured":"Penedo, G., et al.: The RefinedWeb dataset for falcon LLM: outperforming curated corpora with web data, and web data only. arXiv preprint arXiv:2306.01116 (2023)"},{"key":"3_CR66","unstructured":"Peng, B., Li, C., He, P., Galley, M., Gao, J.: Instruction tuning with GPT-4. arXiv preprint arXiv:2304.03277 (2023)"},{"key":"3_CR67","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"3_CR68","doi-asserted-by":"publisher","first-page":"74","DOI":"10.1007\/s11263-016-0965-7","volume":"123","author":"BA Plummer","year":"2015","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. Int. J. Comput. Vision 123, 74\u201393 (2015)","journal-title":"Int. J. Comput. Vision"},{"key":"3_CR69","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"3_CR70","unstructured":"Radford, A., Narasimhan, K.: Improving language understanding by generative pre-training (2018)"},{"issue":"8","key":"3_CR71","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"3_CR72","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2021)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"3_CR73","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vision 115, 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vision"},{"key":"3_CR74","unstructured":"Schuhmann, C., K\u00f6pf, A., Vencu, R., Coombes, T., Beaumont, R.: Laion-coco (2022). https:\/\/laion.ai\/blog\/laion-coco\/"},{"key":"3_CR75","unstructured":"Schuhmann, C., et al.: LAION-400M: open dataset of clip-filtered 400 Million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"3_CR76","doi-asserted-by":"publisher","unstructured":"Schwenk, D., Khandelwal, A., Clark, C., Marino, K., Mottaghi, R.: A-OKVQA: a benchmark for visual question answering using world knowledge. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20074-8_9","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"3_CR77","unstructured":"Shao, W., et\u00a0al.: TinyLVLM-eHub: early multimodal experiments with bard. arXiv preprint arXiv:2308.03729 (2023)"},{"key":"3_CR78","unstructured":"ShareGPT: ShareGPT (2023). https:\/\/sharegpt.com\/"},{"key":"3_CR79","unstructured":"Shukor, M., Dancette, C., Rame, A., Cord, M.: Unival: Unified model for image, video, audio and language tasks. Transactions on Machine Learning Research Journal (2023)"},{"key":"3_CR80","doi-asserted-by":"crossref","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., Singh, A.: Textcaps: a dataset for image captioning with reading comprehension. arXiv arXiv:2003.12462 (2020)","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"3_CR81","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8309\u20138318 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"3_CR82","unstructured":"Su, Y., Lan, T., Li, H., Xu, J., Wang, Y., Cai, D.: PandaGPT: one model to instruction-follow them all. arXiv arXiv:2305.16355 (2023)"},{"key":"3_CR83","doi-asserted-by":"crossref","unstructured":"Sung, Y.L., Li, L., Lin, K., Gan, Z., Bansal, M., Wang, L.: An empirical study of multimodal model merging. arXiv preprint arXiv:2304.14933 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.105"},{"key":"3_CR84","doi-asserted-by":"crossref","unstructured":"Suvorov, R., et al.: Resolution-robust large mask inpainting with fourier convolutions. arXiv preprint arXiv:2109.07161 (2021)","DOI":"10.1109\/WACV51458.2022.00323"},{"key":"3_CR85","unstructured":"Taori, R., et al.: Stanford alpaca: an instruction-following llama model (2023). https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"3_CR86","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"3_CR87","unstructured":"Touvron, H., et\u00a0al.: LLaMA 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"3_CR88","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"3_CR89","unstructured":"Wang, K., et al.: Mathcoder: seamless code integration in LLMs for enhanced mathematical reasoning. arXiv preprint arXiv:2310.03731 (2023)"},{"key":"3_CR90","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"3_CR91","unstructured":"Wang, W., et\u00a0al.: VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. arXiv preprint arXiv:2305.11175 (2023)"},{"key":"3_CR92","unstructured":"Wen, S., Fang, G., Zhang, R., Gao, P., Dong, H., Metaxas, D.: Improving compositional text-to-image generation with large vision-language models. arXiv preprint arXiv:2310.06311 (2023)"},{"key":"3_CR93","doi-asserted-by":"crossref","unstructured":"Woo, S., et al.: ConvNext V2: co-designing and scaling convnets with masked autoencoders. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16133\u201316142 (2023)","DOI":"10.1109\/CVPR52729.2023.01548"},{"key":"3_CR94","unstructured":"Wortsman, M., et\u00a0al.: Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time. In: International Conference on Machine Learning, pp. 23965\u201323998. PMLR (2022)"},{"key":"3_CR95","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual ChatGPT: talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671 (2023)"},{"key":"3_CR96","unstructured":"Wu, C., et al.: $$\\pi $$-tuning: transferring multimodal foundation models with optimal multi-task interpolation. In: International Conference on Machine Learning, pp. 37713\u201337727. PMLR (2023)"},{"key":"3_CR97","doi-asserted-by":"crossref","unstructured":"Xu, R., Wang, X., Wang, T., Chen, Y., Pang, J., Lin, D.: PointLLM: empowering large language models to understand point clouds. arXiv arXiv:2308.16911 (2023)","DOI":"10.1007\/978-3-031-72698-9_8"},{"key":"3_CR98","doi-asserted-by":"crossref","unstructured":"Yan, B., et al.: Universal instance perception as object discovery and retrieval. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15325\u201315336 (2023)","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"3_CR99","unstructured":"Yang, E., et al.: Adamerging: adaptive model merging for multi-task learning. arXiv preprint arXiv:2310.02575 (2023)"},{"key":"3_CR100","unstructured":"Yang, Z., et al.: MM-react: prompting ChatGPT for multimodal reasoning and action. arXiv preprint arXiv:2303.11381 (2023)"},{"key":"3_CR101","unstructured":"Ye, Q., et\u00a0al.: mPLUG-Owl: modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"3_CR102","unstructured":"Yu, F., Koltun, V.: Multi-scale context aggregation by dilated convolutions. arXiv preprint arXiv:1511.07122 (2015)"},{"key":"3_CR103","unstructured":"Yu, T., et al.: Inpaint anything: segment anything meets image inpainting. arXiv preprint arXiv:2304.06790 (2023)"},{"key":"3_CR104","unstructured":"Yu, W., et al.: MM-Vet: evaluating large multimodal models for integrated capabilities. arXiv arXiv:2308.02490 (2023)"},{"key":"3_CR105","unstructured":"Zhang, R., et al.: LLaMA-Adapter: efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)"},{"key":"3_CR106","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: Prompt, generate, then cache: cascade of foundation models makes strong few-shot learners. In: CVPR 2023 (2023)","DOI":"10.1109\/CVPR52729.2023.01460"},{"key":"3_CR107","unstructured":"Zhang, S., et\u00a0al.: OPT: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"3_CR108","unstructured":"Zhao, H., et al.: MMICL: empowering vision-language model with multi-modal in-context learning. arXiv preprint arXiv:2309.07915 (2023)"},{"key":"3_CR109","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., Jia, J.: Pyramid scene parsing network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2881\u20132890 (2017)","DOI":"10.1109\/CVPR.2017.660"},{"key":"3_CR110","unstructured":"Zhou, A., et\u00a0al.: Solving challenging math word problems using GPT-4 code interpreter with code-based self-verification. arXiv preprint arXiv:2308.07921 (2023)"},{"key":"3_CR111","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"3_CR112","doi-asserted-by":"crossref","unstructured":"Zhu, X., Zhang, R., He, B., Zeng, Z., Zhang, S., Gao, P.: PointCLIP V2: adapting clip for powerful 3D open-world learning. arXiv preprint arXiv:2211.11682 (2022)","DOI":"10.1109\/ICCV51070.2023.00249"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73033-7_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T15:07:32Z","timestamp":1732979252000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73033-7_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730320","9783031730337"],"references-count":112,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73033-7_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}