{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T19:33:05Z","timestamp":1775763185399,"version":"3.50.1"},"publisher-location":"Cham","reference-count":94,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729942","type":"print"},{"value":"9783031729959","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72995-9_11","type":"book-chapter","created":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T19:16:03Z","timestamp":1732389363000},"page":"177-197","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["X-InstructBLIP: A Framework for\u00a0Aligning Image, 3D, Audio, Video to\u00a0LLMs and\u00a0its Emergent Cross-Modal Reasoning"],"prefix":"10.1007","author":[{"given":"Artemis","family":"Panagopoulou","sequence":"first","affiliation":[]},{"given":"Le","family":"Xue","sequence":"additional","affiliation":[]},{"given":"Ning","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Junnan","family":"Li","sequence":"additional","affiliation":[]},{"given":"Dongxu","family":"Li","sequence":"additional","affiliation":[]},{"given":"Shafiq","family":"Joty","sequence":"additional","affiliation":[]},{"given":"Ran","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Silvio","family":"Savarese","sequence":"additional","affiliation":[]},{"given":"Caiming","family":"Xiong","sequence":"additional","affiliation":[]},{"given":"Juan Carlos","family":"Niebles","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,24]]},"reference":[{"key":"11_CR1","doi-asserted-by":"crossref","unstructured":"Alamri, H., Hori, C., Marks, T.K., Batra, D., Parikh, D.: Audio visual scene-aware dialog (AVSD) track for natural language generation in DSTC7. In: DSTC7 at AAAI2019 Workshop, vol.\u00a02 (2018)","DOI":"10.1109\/CVPR.2019.00774"},{"key":"11_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR3","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"11_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"51","DOI":"10.1007\/978-3-030-58589-1_4","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Bansal","year":"2020","unstructured":"Bansal, A., Zhang, Y., Chellappa, R.: Visual question answering on image sets. 
In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12366, pp. 51\u201367. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_4"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Bigham, J.P., et al.: Vizwiz: nearly real-time answers to visual questions. In: Proceedings of the 23nd Annual ACM Symposium on User Interface Software and Technology, pp. 333\u2013342 (2010)","DOI":"10.1145\/1866029.1866080"},{"key":"11_CR6","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR7","unstructured":"Chen, F., et al.: X-LLM: bootstrapping advanced large language models by treating multi-modalities as foreign languages. arXiv preprint arXiv:2305.04160 (2023)"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Chen, J., Guo, H., Yi, K., Li, B., Elhoseiny, M.: Visualgpt: data-efficient adaptation of pretrained language models for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18030\u201318040 (2022)","DOI":"10.1109\/CVPR52688.2022.01750"},{"key":"11_CR9","unstructured":"Chen, S., et al.: BEATs: audio pre-training with acoustic tokenizers. In: Krause, A., Brunskill, E., Cho, K., Engelhardt, B., Sabato, S., Scarlett, J. (eds.) Proceedings of the 40th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0202, pp. 5178\u20135193. PMLR (2023). https:\/\/proceedings.mlr.press\/v202\/chen23ag.html"},{"key":"11_CR10","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: SimCLR: a simple framework for contrastive learning of visual representations. In: International Conference on Learning Representations, vol.\u00a02 (2020)"},{"key":"11_CR11","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"11_CR12","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* chatgpt quality (2023). https:\/\/vicuna.lmsys.org. Accessed 14 Apr 2023"},{"key":"11_CR13","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. In: International Conference on Machine Learning, pp. 1931\u20131942. PMLR (2021)"},{"key":"11_CR14","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=vvoWPYqZJA"},{"key":"11_CR15","unstructured":"Deshmukh, S., Elizalde, B., Singh, R., Wang, H.: Pengi: an audio language model for audio tasks. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=gJLAfO4KUq"},{"key":"11_CR16","unstructured":"Driess, D., et al.: PaLM-e: an embodied multimodal language model. In: Krause, A., Brunskill, E., Cho, K., Engelhardt, B., Sabato, S., Scarlett, J. (eds.) Proceedings of the 40th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0202, pp. 8469\u20138488. PMLR (2023). 
https:\/\/proceedings.mlr.press\/v202\/driess23a.html"},{"key":"11_CR17","doi-asserted-by":"crossref","unstructured":"Drossos, K., Lipping, S., Virtanen, T.: Clotho: an audio captioning dataset. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 736\u2013740. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"11_CR18","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: EVA: exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19358\u201319369 (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"11_CR19","unstructured":"Fu, C., et al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"11_CR20","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: Imagebind: one embedding space to bind them all. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15180\u201315190 (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"11_CR21","unstructured":"Gong, Y., Luo, H., Liu, A.H., Karlinsky, L., Glass, J.R.: Listen, think, and understand. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=nBZBPXdJlC"},{"key":"11_CR22","doi-asserted-by":"crossref","unstructured":"Li, G., Xu, Y., Hu, D.: Multi-scale attention for audio question answering. In: Proceedings of INTERSPEECH (2023)","DOI":"10.21437\/Interspeech.2023-1606"},{"key":"11_CR23","doi-asserted-by":"crossref","unstructured":"Gui, L., Wang, B., Huang, Q., Hauptmann, A.G., Bisk, Y., Gao, J.: Kat: a knowledge augmented transformer for vision-and-language. In: Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 956\u2013968 (2022)","DOI":"10.18653\/v1\/2022.naacl-main.70"},{"key":"11_CR24","unstructured":"Guo, Z., et al.: Point-bind & point-LLM: aligning point cloud with multi-modality for 3D understanding, generation, and instruction following. arXiv preprint arXiv:2309.00615 (2023)"},{"key":"11_CR25","doi-asserted-by":"crossref","unstructured":"Guzhov, A., Raue, F., Hees, J., Dengel, A.: Audioclip: extending clip to image, text and audio. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 976\u2013980. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Han, J., et al.: Onellm: one framework to align all modalities with language. arXiv preprint arXiv:2312.03700 (2023)","DOI":"10.1109\/CVPR52733.2024.02510"},{"key":"11_CR27","unstructured":"Han, J., et al.: Imagebind-LLM: multi-modality instruction tuning. arXiv preprint arXiv:2309.03905 (2023)"},{"key":"11_CR28","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"11_CR29","unstructured":"Hong, Y., et al.: 3D-LLM: injecting the 3D world into large language models. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). 
https:\/\/openreview.net\/forum?id=YQA28p7qNz"},{"key":"11_CR30","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. In: International Conference on Learning Representations (2021)"},{"key":"11_CR31","unstructured":"Huang, S., et al.: Language is not all you need: aligning perception with language models. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=UpN2wfrLec"},{"key":"11_CR32","unstructured":"Jaegle, A., Gimeno, F., Brock, A., Vinyals, O., Zisserman, A., Carreira, J.: Perceiver: general perception with iterative attention. In: International Conference on Machine Learning, pp. 4651\u20134664. PMLR (2021)"},{"key":"11_CR33","doi-asserted-by":"publisher","unstructured":"Jiang, C., Ye, W., Xu, H., Huang, S., Huang, F., Zhang, S.: Vision language pre-training by contrastive learning with cross-modal similarity regulation. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Toronto, Canada, pp. 14660\u201314679. Association for Computational Linguistics (2023). https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.819. https:\/\/aclanthology.org\/2023.acl-long.819","DOI":"10.18653\/v1\/2023.acl-long.819"},{"key":"11_CR34","unstructured":"Kim, C.D., Kim, B., Lee, H., Kim, G.: Audiocaps: generating captions for audios in the wild. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 119\u2013132 (2019)"},{"key":"11_CR35","doi-asserted-by":"crossref","unstructured":"Kim, M., Sung-Bin, K., Oh, T.H.: Prefix tuning for automated audio captioning. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096877"},{"key":"11_CR36","unstructured":"Koh, J.Y., Salakhutdinov, R., Fried, D.: Grounding language models to images for multimodal inputs and outputs. In: Krause, A., Brunskill, E., Cho, K., Engelhardt, B., Sabato, S., Scarlett, J. (eds.) Proceedings of the 40th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0202, pp. 17283\u201317300. PMLR (2023). https:\/\/proceedings.mlr.press\/v202\/koh23a.html"},{"key":"11_CR37","doi-asserted-by":"publisher","unstructured":"Li, C., et al.: mPLUG: effective and efficient vision-language learning by cross-modal skip-connections. In: Goldberg, Y., Kozareva, Z., Zhang, Y. (eds.) Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, Abu Dhabi, United Arab Emirates, pp. 7241\u20137259. Association for Computational Linguistics (2022). https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.488. https:\/\/aclanthology.org\/2022.emnlp-main.488","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"11_CR38","doi-asserted-by":"crossref","unstructured":"Li, D., Li, J., Le, H., Wang, G., Savarese, S., Hoi, S.C.: LAVIS: a one-stop library for language-vision intelligence. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations), Toronto, Canada, pp. 31\u201341. Association for Computational Linguistics (2023). 
https:\/\/aclanthology.org\/2023.acl-demo.3","DOI":"10.18653\/v1\/2023.acl-demo.3"},{"key":"11_CR39","doi-asserted-by":"crossref","unstructured":"Li, G., Wei, Y., Tian, Y., Xu, C., Wen, J.R., Hu, D.: Learning to answer questions in dynamic audio-visual scenarios. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19108\u201319118 (2022)","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"11_CR40","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: 40th International Conference on Machine Learning (2023)"},{"key":"11_CR41","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Chaudhuri, K., Jegelka, S., Song, L., Szepesvari, C., Niu, G., Sabato, S. (eds.) Proceedings of the 39th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0162, pp. 12888\u201312900. PMLR (2022). https:\/\/proceedings.mlr.press\/v162\/li22n.html"},{"key":"11_CR42","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"11_CR44","doi-asserted-by":"publisher","unstructured":"Li, Y., Li, W., Nie, L.: MMCoQA: conversational question answering over text, tables, and images. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Dublin, Ireland, pp. 4220\u20134231. Association for Computational Linguistics (2022). https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.290. https:\/\/aclanthology.org\/2022.acl-long.290","DOI":"10.18653\/v1\/2022.acl-long.290"},{"key":"11_CR45","first-page":"10560","volume":"35","author":"Y Lin","year":"2022","unstructured":"Lin, Y., Xie, Y., Chen, D., Xu, Y., Zhu, C., Yuan, L.: Revive: regional visual representation matters in knowledge-based visual question answering. Adv. Neural. Inf. Process. Syst. 35, 10560\u201310571 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR46","doi-asserted-by":"crossref","unstructured":"Lipping, S., Sudarsanam, P., Drossos, K., Virtanen, T.: Clotho-AQA: a crowdsourced dataset for audio question answering. In: 2022 30th European Signal Processing Conference (EUSIPCO), pp. 1140\u20131144. IEEE (2022)","DOI":"10.23919\/EUSIPCO55093.2022.9909680"},{"key":"11_CR47","unstructured":"Liu, H., Yan, W., Abbeel, P.: Language quantized autoencoders: towards unsupervised text-image alignment. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). 
https:\/\/openreview.net\/forum?id=mlxRLIy7kc"},{"key":"11_CR48","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=w0H2xGHlkw"},{"key":"11_CR49","unstructured":"Liu, P.J., et al.: Generating Wikipedia by summarizing long sequences. In: International Conference on Learning Representations (2018). https:\/\/openreview.net\/forum?id=Hyg0vbWC-"},{"key":"11_CR50","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhu, Z., Ye, N., Guadarrama, S., Murphy, K.: Improved image captioning via policy gradient optimization of spider. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 873\u2013881 (2017)","DOI":"10.1109\/ICCV.2017.100"},{"key":"11_CR51","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"11_CR52","unstructured":"Luo, R., et al.: Valley: video assistant with large language model enhanced ability (2024). https:\/\/openreview.net\/forum?id=bjyf5FyQ0a"},{"key":"11_CR53","unstructured":"Luo, T., Rockwell, C., Lee, H., Johnson, J.: Scalable 3D captioning with pretrained models. In: Proceedings of the NeurIPS 2023 (2023)"},{"key":"11_CR54","unstructured":"Van\u00a0der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11) (2008)"},{"key":"11_CR55","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.S.: Video-chatgpt: towards detailed video understanding via large vision and language models (2023)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"11_CR56","doi-asserted-by":"publisher","unstructured":"Ma\u00f1as, O., Rodriguez\u00a0Lopez, P., Ahmadi, S., Nematzadeh, A., Goyal, Y., Agrawal, A.: MAPL: parameter-efficient adaptation of unimodal pre-trained models for vision-language few-shot prompting. In: Vlachos, A., Augenstein, I. (eds.) Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, Dubrovnik, Croatia, pp. 2523\u20132548. Association for Computational Linguistics (2023). https:\/\/doi.org\/10.18653\/v1\/2023.eacl-main.185. https:\/\/aclanthology.org\/2023.eacl-main.185","DOI":"10.18653\/v1\/2023.eacl-main.185"},{"key":"11_CR57","doi-asserted-by":"crossref","unstructured":"Moon, S., et al.: Anymal: an efficient and scalable any-modality augmented language model. arXiv preprint arXiv:2309.16058 (2023)","DOI":"10.18653\/v1\/2024.emnlp-industry.98"},{"key":"11_CR58","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"407","DOI":"10.1007\/978-3-031-19781-9_24","volume-title":"ECCV 2022","author":"A Nagrani","year":"2022","unstructured":"Nagrani, A., Seo, P.H., Seybold, B., Hauth, A., Manen, S., Sun, C., Schmid, C.: Learning audio-video modalities from image captions. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13674, pp. 407\u2013426. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19781-9_24"},{"key":"11_CR59","unstructured":"Najdenkoska, I., Zhen, X., Worring, M.: Meta learning to bridge vision and language models for multimodal few-shot learning. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=3oWo92cQyxL"},{"key":"11_CR60","doi-asserted-by":"crossref","unstructured":"Piczak, K.J.: ESC: dataset for environmental sound classification. 
In: Proceedings of the 23rd ACM International Conference on Multimedia, pp. 1015\u20131018 (2015)","DOI":"10.1145\/2733373.2806390"},{"key":"11_CR61","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"11_CR62","unstructured":"Salesforce: Ulip (2022). https:\/\/github.com\/salesforce\/ULIP. Accessed 1 July 2023"},{"key":"11_CR63","doi-asserted-by":"crossref","unstructured":"Shao, Z., Yu, Z., Wang, M., Yu, J.: Prompting large language models with answer heuristics for knowledge-based visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14974\u201314983 (2023)","DOI":"10.1109\/CVPR52729.2023.01438"},{"key":"11_CR64","unstructured":"Shu, F., Zhang, L., Jiang, H., Xie, C.: Audio-visual LLM for video understanding. arXiv preprint arXiv:2312.06720 (2023)"},{"key":"11_CR65","unstructured":"Su, Y., Lan, T., Li, H., Xu, J., Wang, Y., Cai, D.: PandaGPT: one model to instruction-follow them all. In: Hazarika, D., Tang, X.R., Jin, D. (eds.) Proceedings of the 1st Workshop on Taming Large Language Models: Controllability in the era of Interactive Assistants, Prague, Czech Republic, pp. 11\u201323. Association for Computational Linguistics (2023). https:\/\/aclanthology.org\/2023.tllm-1.2"},{"key":"11_CR66","unstructured":"Sun, Q., et al.: EMU: generative pretraining in multimodality. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=mL8Q9OOamV"},{"key":"11_CR67","doi-asserted-by":"crossref","unstructured":"Tanaka, R., Nishida, K., Nishida, K., Hasegawa, T., Saito, I., Saito, K.: Slidevqa: a dataset for document visual question answering on multiple images. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i11.26598"},{"key":"11_CR68","unstructured":"Tang, C., et al.: SALMONN: towards generic hearing abilities for large language models. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=14rn7HpKVk"},{"key":"11_CR69","first-page":"200","volume":"34","author":"M Tsimpoukelli","year":"2021","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. Adv. Neural. Inf. Process. Syst. 34, 200\u2013212 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR70","doi-asserted-by":"crossref","unstructured":"Uy, M.A., Pham, Q.H., Hua, B.S., Nguyen, T., Yeung, S.K.: Revisiting point cloud classification: a new benchmark dataset and classification model on real-world data. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1588\u20131597 (2019)","DOI":"10.1109\/ICCV.2019.00167"},{"key":"11_CR71","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"11_CR72","unstructured":"Wang, J., et al.: GIT: a generative image-to-text transformer for vision and language. Trans. Mach. Learn. Res. (2022)"},{"key":"11_CR73","unstructured":"Wang, P., et al.: One-peace: exploring one general representation model toward unlimited modalities. 
arXiv preprint arXiv:2305.11172 (2023)"},{"key":"11_CR74","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"11_CR75","doi-asserted-by":"crossref","unstructured":"Wang, T., et al.: Accelerating vision-language pretraining with free language modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 23161\u201323170 (2023)","DOI":"10.1109\/CVPR52729.2023.02218"},{"key":"11_CR76","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Image as a foreign language: beit pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19175\u201319186 (2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"11_CR77","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.F., Wang, W.Y.: Vatex: a large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"11_CR78","doi-asserted-by":"crossref","unstructured":"Wang, Z., Chen, C., Li, P., Liu, Y.: Filling the image information gap for VQA: prompting large language models to proactively ask questions. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 2874\u20132890 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.189"},{"key":"11_CR79","unstructured":"Wei, J., et al.: Finetuned language models are zero-shot learners. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=gEZrGCozdqR"},{"key":"11_CR80","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR81","unstructured":"Wu, Z., et al.: 3D shapenets: a deep representation for volumetric shapes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1912\u20131920 (2015)"},{"key":"11_CR82","unstructured":"XinhaoMei: Wavcaps (2023). https:\/\/github.com\/XinhaoMei\/WavCaps. Accessed 1 July 2023"},{"key":"11_CR83","unstructured":"Xu, H., et al: mPLUG-2: a modularized multi-modal foundation model across text, image and video. In: Proceedings of the 40th International Conference on Machine Learning, ICML 2023. JMLR.org (2023)"},{"key":"11_CR84","doi-asserted-by":"crossref","unstructured":"Xu, R., Wang, X., Wang, T., Chen, Y., Pang, J., Lin, D.: Pointllm: empowering large language models to understand point clouds (2023)","DOI":"10.1007\/978-3-031-72698-9_8"},{"key":"11_CR85","doi-asserted-by":"publisher","unstructured":"Xu, W., Chen, K., Zhao, T.: Discriminative reasoning for document-level relation extraction. In: Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021, pp. 1653\u20131663. Association for Computational Linguistics, Online (2021). https:\/\/doi.org\/10.18653\/v1\/2021.findings-acl.144. 
https:\/\/aclanthology.org\/2021.findings-acl.144","DOI":"10.18653\/v1\/2021.findings-acl.144"},{"key":"11_CR86","first-page":"124","volume":"35","author":"A Yang","year":"2022","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Zero-shot video question answering via frozen bidirectional language models. Adv. Neural. Inf. Process. Syst. 35, 124\u2013141 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR87","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"521","DOI":"10.1007\/978-3-031-20059-5_30","volume-title":"ECCV 2022","author":"Z Yang","year":"2022","unstructured":"Yang, Z., et al.: UniTAB: unifying text and box outputs for grounded vision-language modeling. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13696, pp. 521\u2013539. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_30"},{"key":"11_CR88","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: An empirical study of GPT-3 for few-shot knowledge-based VQA. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 3081\u20133089 (2022)","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"11_CR89","unstructured":"Yu, L., et al.: SPAE: semantic pyramid autoencoder for multimodal generation with frozen LLMs. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=CXPUg86A1D"},{"key":"11_CR90","unstructured":"Yu, W., et al.: Mm-vet: evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)"},{"key":"11_CR91","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-llama: an instruction-tuned audio-visual language model for video understanding. Empirical Methods in Natural Language Processing 2023, Demo Track (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"11_CR92","unstructured":"Zhang, R., et al.: LLaMA-adapter: efficient fine-tuning of large language models with zero-initialized attention. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=d4UiXAHN2W"},{"key":"11_CR93","unstructured":"Zhao, Z., et al.: Chatbridge: bridging modalities with large language model as a language catalyst. arXiv preprint arXiv:2305.16103 (2023)"},{"key":"11_CR94","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: The Twelfth International Conference on Learning Representations (2024). 
https:\/\/openreview.net\/forum?id=1tZbq88f27"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72995-9_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T22:10:38Z","timestamp":1733091038000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72995-9_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,24]]},"ISBN":["9783031729942","9783031729959"],"references-count":94,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72995-9_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,24]]},"assertion":[{"value":"24 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
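A record in exactly this shape ("status", "message-type": "work", and a "message" object carrying the metadata) is what the public Crossref REST API returns for a DOI. Below is a minimal sketch of fetching and reading this record, assuming the third-party requests package is installed; the endpoint and the optional "mailto" polite-pool parameter are standard Crossref conventions, and the contact address is a placeholder.

# Minimal sketch: fetch this chapter's Crossref record and read a few fields.
# Assumes the `requests` package; the mailto address below is a placeholder.
import requests

DOI = "10.1007/978-3-031-72995-9_11"

resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.org"},  # hypothetical contact, polite-pool etiquette
    timeout=30,
)
resp.raise_for_status()
payload = resp.json()        # {"status": "ok", "message-type": "work", "message": {...}}
work = payload["message"]    # the object shown above: title, authors, references, ...

print(work["title"][0])                        # chapter title (title is a list)
print(work["DOI"], work["type"], work["page"]) # DOI, "book-chapter", "177-197"
print(len(work.get("reference", [])))          # 94, matching "references-count"
for a in work["author"]:
    print(a["given"], a["family"])             # Artemis Panagopoulou, Le Xue, ...

Note that JSON decoding turns the escape sequences visible in the raw payload (such as \u00a0, \u2013, and \/) into their plain characters, so the parsed strings read normally.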