{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:09:43Z","timestamp":1757617783178,"version":"3.44.0"},"reference-count":83,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T00:00:00Z","timestamp":1746230400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T00:00:00Z","timestamp":1746230400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2021YFC3300200"],"award-info":[{"award-number":["2021YFC3300200"]}]},{"DOI":"10.13039\/501100001809","name":"Joint Funds of the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["U2336211"],"award-info":[{"award-number":["U2336211"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s11263-025-02440-4","type":"journal-article","created":{"date-parts":[[2025,5,2]],"date-time":"2025-05-02T22:47:49Z","timestamp":1746226069000},"page":"5527-5543","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ADEM-VL: Adaptive and Embedded Fusion for Efficient Vision-Language Tuning"],"prefix":"10.1007","volume":"133","author":[{"given":"Zhiwei","family":"Hao","sequence":"first","affiliation":[]},{"given":"Jianyuan","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Li","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Luo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7532-0496","authenticated-orcid":false,"given":"Han","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Yonggang","family":"Wen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,3]]},"reference":[{"key":"2440_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F. L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et al. (2023). Gpt-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"2440_CR2","first-page":"23716","volume":"35","author":"J-B Alayrac","year":"2022","unstructured":"Alayrac, J.-B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., et al. (2022). Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems, 35, 23716\u201323736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2440_CR3","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., & Zhou, J. (2023). Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966"},{"key":"2440_CR4","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., & Zisserman, A. (2021). Frozen in time: A joint video and image encoder for end-to-end retrieval. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (pp. 
1728\u20131738).","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"2440_CR5","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. (2020). Language models are few-shot learners. Advances in Neural Information Processing Systems, 33, 1877\u20131901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2440_CR6","unstructured":"Chen, J., Zhu, D., Shen, X., Li, X., Liu, Z., Zhang, P., Krishnamoorthi, R., Chandra, V., Xiong, Y., & Elhoseiny, M. (2023). Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478"},{"key":"2440_CR7","unstructured":"Chen, X., Fang, H., Lin, T.-Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., & Zitnick, C. L. (2015). Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325"},{"key":"2440_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Y.-C., Li, L., Yu, L., El\u00a0Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., & Liu, J. (2020). Uniter: Universal image-text representation learning. In European Conference on Computer Vision (pp. 104\u2013120). Springer.","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"2440_CR9","unstructured":"Cho, J., Lei, J., Tan, H., & Bansal, M. (2021). Unifying vision-and-language tasks via text generation. In International Conference on Machine Learning (pp. 1931\u20131942). PMLR."},{"key":"2440_CR10","unstructured":"Dong, X., Zhang, P., Zang, Y., Cao, Y., Wang, B., Ouyang, L., Wei, X., Zhang, S., Duan, H., Cao, M., et al. (2024). Internlm-xcomposer2: Mastering free-form text-image composition and comprehension in vision-language large model. arXiv preprint arXiv:2401.16420"},{"key":"2440_CR11","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"2440_CR12","unstructured":"Driess, D., Xia, F., Sajjadi, M.S., Lynch, C., Chowdhery, A., Ichter, B., Wahid, A., Tompson, J., Vuong, Q., Yu, T. et al. (2023). Palm-e: An embodied multimodal language model. arXiv preprint arXiv:2303.03378"},{"key":"2440_CR13","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.neunet.2017.12.012","volume":"107","author":"S Elfwing","year":"2018","unstructured":"Elfwing, S., Uchibe, E., & Doya, K. (2018). Sigmoid-weighted linear units for neural network function approximation in reinforcement learning. Neural Networks, 107, 3\u201311.","journal-title":"Neural Networks"},{"key":"2440_CR14","unstructured":"Fu, C., Chen, P., Shen, Y., Qin, Y., Zhang, M., Lin, X., Yang, J., Zheng, X., Li, K., Sun, X., et al. (2023). Mme: A comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394"},{"key":"2440_CR15","first-page":"6616","volume":"33","author":"Z Gan","year":"2020","unstructured":"Gan, Z., Chen, Y.-C., Li, L., Zhu, C., Cheng, Y., & Liu, J. (2020). Large-scale adversarial training for vision-and-language representation learning. Advances in Neural Information Processing Systems, 33, 6616\u20136628.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2440_CR16","unstructured":"Gao, P., Han, J., Zhang, R., Lin, Z., Geng, S., Zhou, A., Zhang, W., Lu, P., He, C., Yue, X., et al. 
(2023). Llama-adapter v2: Parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010"},{"key":"2440_CR17","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D. & Parikh, D. (2017). Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 6904\u20136913).","DOI":"10.1109\/CVPR.2017.670"},{"key":"2440_CR18","doi-asserted-by":"crossref","unstructured":"Guo, D., Rush, A. M., & Kim, Y. (2020). Parameter-efficient transfer learning with diff pruning. arXiv preprint arXiv:2012.07463","DOI":"10.18653\/v1\/2021.acl-long.378"},{"issue":"6789","key":"2440_CR19","doi-asserted-by":"publisher","first-page":"947","DOI":"10.1038\/35016072","volume":"405","author":"RH Hahnloser","year":"2000","unstructured":"Hahnloser, R. H., Sarpeshkar, R., Mahowald, M. A., Douglas, R. J., & Seung, H. S. (2000). Digital selection and analogue amplification coexist in a cortex-inspired silicon circuit. Nature, 405(6789), 947\u2013951.","journal-title":"Nature"},{"key":"2440_CR20","doi-asserted-by":"publisher","first-page":"4262","DOI":"10.1109\/TMM.2022.3192663","volume":"24","author":"Z Hao","year":"2022","unstructured":"Hao, Z., Luo, Y., Wang, Z., Hu, H., & An, J. (2022). Cdfkd-mfs: Collaborative data-free knowledge distillation via multi-level feature sharing. IEEE Transactions on Multimedia, 24, 4262\u20134274.","journal-title":"IEEE Transactions on Multimedia"},{"key":"2440_CR21","unstructured":"He, J., Zhou, C., Ma, X., Berg-Kirkpatrick, T., & Neubig, G. (2021). Towards a unified view of parameter-efficient transfer learning. arXiv preprint arXiv:2110.04366"},{"key":"2440_CR22","unstructured":"Hendrycks, D., & Gimpel, K. (2016). Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415"},{"key":"2440_CR23","unstructured":"Houlsby, N., Giurgiu, A., Jastrzebski, S., Morrone, B., De\u00a0Laroussilhe, Q., Gesmundo, A., Attariyan, M., & Gelly, S. (2019). Parameter-efficient transfer learning for nlp. In International Conference on Machine Learning (pp. 2790\u20132799). PMLR."},{"key":"2440_CR24","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2021). Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685"},{"key":"2440_CR25","doi-asserted-by":"crossref","unstructured":"Hu, Z.-Y., Li, Y., Lyu, M. R., & Wang, L. (2023). Vl-pet: Vision-and-language parameter-efficient tuning via granularity control. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (pp. 3010\u20133020).","DOI":"10.1109\/ICCV51070.2023.00281"},{"key":"2440_CR26","doi-asserted-by":"crossref","unstructured":"Hudson, D. A. & Manning, C. D. (2019). Gqa: A new dataset for real-world visual reasoning and compositional question answering. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 6700\u20136709).","DOI":"10.1109\/CVPR.2019.00686"},{"key":"2440_CR27","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning (pp. 4904\u20134916). PMLR."},{"key":"2440_CR28","unstructured":"Jia, D., Guo, J., Han, K., Wu, H., Zhang, C., Xu, C., & Chen, X. (2024). 
Geminifusion: Efficient pixel-wise multimodal fusion for vision transformer. arXiv preprint arXiv:2406.01210"},{"key":"2440_CR29","unstructured":"Jie, S., Tang, Y., Ding, N., Deng, Z.-H., Han, K., & Wang, Y. (2024). Memory-space visual prompting for efficient vision-language fine-tuning. arXiv preprint arXiv:2405.05615"},{"key":"2440_CR30","unstructured":"Jin, Y., Xu, K., Chen, L., Liao, C., Tan, J., Chen, B., Lei, C., Liu, A., Song, C., Lei, X., et al. (2023). Unified language-vision pretraining with dynamic discrete visual tokenization. arXiv preprint arXiv:2309.04669"},{"key":"2440_CR31","first-page":"1022","volume":"34","author":"R Karimi Mahabadi","year":"2021","unstructured":"Karimi Mahabadi, R., Henderson, J., & Ruder, S. (2021). Compacter: Efficient low-rank hypercomplex adapter layers. Advances in Neural Information Processing Systems, 34, 1022\u20131035.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2440_CR32","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 3128\u20133137).","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"2440_CR33","unstructured":"Katharopoulos, A., Vyas, A., Pappas, N., & Fleuret, F. (2020). Transformers are rnns: Fast autoregressive transformers with linear attention. In International Conference on Machine Learning (pp. 5156\u20135165). PMLR."},{"key":"2440_CR34","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Pu, F., Yang, J., Li, C., & Liu, Z. (2023). Mimic-it: Multi-modal in-context instruction tuning. arXiv preprint arXiv:2306.05425"},{"key":"2440_CR35","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.-C., Cheng, Y., Gan, Z., Yu, L., & Liu, J. (2020). Hero: Hierarchical encoder for video+language omni-representation pre-training. arXiv preprint arXiv:2005.00200","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"2440_CR36","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023). Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International Conference on Machine Learning (pp. 19730\u201319742). PMLR."},{"key":"2440_CR37","unstructured":"Li, J., Li, D., Xiong, C., & Hoi, S. (2022). Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning (pp. 12888\u201312900). PMLR."},{"key":"2440_CR38","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., Hu, X., Zhang, L., Wang, L., Hu, H., Dong, L., Wei, F., et al. (2020). Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX 16 (pp. 121\u2013137). Springer.","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"2440_CR39","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., & Belongie, S. (2017). Feature pyramid networks for object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 2117\u20132125).","DOI":"10.1109\/CVPR.2017.106"},{"key":"2440_CR40","unstructured":"Lin, B., Tang, Z., Ye, Y., Cui, J., Zhu, B., Jin, P., Zhang, J., Ning, M., & Yuan, L. (2024). Moe-llava: Mixture of experts for large vision-language models. 
arXiv preprint arXiv:2401.15947"},{"key":"2440_CR41","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., & Hoi, S. (2021). Align before fuse: Vision and language representation learning with momentum distillation. Advances in Neural Information Processing Systems, 34, 9694\u20139705.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2440_CR42","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., & Lee, Y. J. (2024). Improved baselines with visual instruction tuning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 26296\u201326306).","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"2440_CR43","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2024). Visual instruction tuning. Advances in Neural Information Processing Systems, 36."},{"key":"2440_CR44","first-page":"1950","volume":"35","author":"H Liu","year":"2022","unstructured":"Liu, H., Tam, D., Muqeeth, M., Mohta, J., Huang, T., Bansal, M., & Raffel, C. A. (2022). Few-shot parameter-efficient fine-tuning is better and cheaper than in-context learning. Advances in Neural Information Processing Systems, 35, 1950\u20131965.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2440_CR45","doi-asserted-by":"crossref","unstructured":"Liu, Y., Duan, H., Zhang, Y., Li, B., Zhang, S., Zhao, W., Yuan, Y., Wang, J., He, C., Liu, Z., et al. (2023). Mmbench: Is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"2440_CR46","unstructured":"Lu, J., Batra, D., Parikh, D., & Lee, S. (2019). Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in Neural Information Processing Systems, 32."},{"key":"2440_CR47","unstructured":"Lu, J., Clark, C., Zellers, R., Mottaghi, R., & Kembhavi, A. (2022). Unified-io: A unified model for vision, language, and multi-modal tasks. In The Eleventh International Conference on Learning Representations."},{"key":"2440_CR48","first-page":"2507","volume":"35","author":"P Lu","year":"2022","unstructured":"Lu, P., Mishra, S., Xia, T., Qiu, L., Chang, K.-W., Zhu, S.-C., Tafjord, O., Clark, P., & Kalyan, A. (2022). Learn to explain: Multimodal reasoning via thought chains for science question answering. Advances in Neural Information Processing Systems, 35, 2507\u20132521.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2440_CR49","unstructured":"Luo, G., Zhou, Y., Ren, T., Chen, S., Sun, X., & Ji, R. (2024). Cheap and quick: Efficient vision-language instruction tuning for large language models. Advances in Neural Information Processing Systems, 36."},{"key":"2440_CR50","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., & Khan, F. S. (2023). Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"2440_CR51","doi-asserted-by":"crossref","unstructured":"McKinzie, B., Gan, Z., Fauconnier, J.-P., Dodge, S., Zhang, B., Dufter, P., Shah, D., Du, X., Peng, F., Weers, F., et al. (2024). Mm1: Methods, analysis & insights from multimodal llm pre-training. 
arXiv preprint arXiv:2403.09611","DOI":"10.1007\/978-3-031-73397-0_18"},{"key":"2440_CR52","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.-B., Smaira, L., Laptev, I., Sivic, J., & Zisserman, A. (2020). End-to-end learning of visual representations from uncurated instructional videos. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 9879\u20139889).","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"2440_CR53","unstructured":"Mokady, R., Hertz, A., & Bermano, A. H. (2021). Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734"},{"key":"2440_CR54","unstructured":"Mu, Y., Zhang, Q., Hu, M., Wang, W., Ding, M., Jin, J., Wang, B., Dai, J., Qiao, Y., & Luo, P. (2024). Embodiedgpt: Vision-language pre-training via embodied chain of thought. Advances in Neural Information Processing Systems, 36."},{"key":"2440_CR55","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al. (2021). Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (pp. 8748\u20138763). PMLR."},{"key":"2440_CR56","doi-asserted-by":"crossref","unstructured":"Singh, A., Hu, R., Goswami, V., Couairon, G., Galuba, W., Rohrbach, M., & Kiela, D. (2022). Flava: A foundational language and vision alignment model. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 15638\u201315650).","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"2440_CR57","unstructured":"Su, W., Zhu, X., Cao, Y., Li, B., Lu, L., Wei, F., & Dai, J. (2019). Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530"},{"key":"2440_CR58","doi-asserted-by":"crossref","unstructured":"Sung, Y.-L., Cho, J., & Bansal, M. (2022). Vl-adapter: Parameter-efficient transfer learning for vision-and-language tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 5227\u20135237).","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"2440_CR59","first-page":"12991","volume":"35","author":"Y-L Sung","year":"2022","unstructured":"Sung, Y.-L., Cho, J., & Bansal, M. (2022). Lst: Ladder side-tuning for parameter and memory efficient transfer learning. Advances in Neural Information Processing Systems, 35, 12991\u201313005.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2440_CR60","first-page":"24193","volume":"34","author":"Y-L Sung","year":"2021","unstructured":"Sung, Y.-L., Nair, V., & Raffel, C. A. (2021). Training neural networks with fixed sparse masks. Advances in Neural Information Processing Systems, 34, 24193\u201324205.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2440_CR61","doi-asserted-by":"crossref","unstructured":"Tan, H., & Bansal, M. (2019). Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","DOI":"10.18653\/v1\/D19-1514"},{"key":"2440_CR62","unstructured":"Taori, R., Gulrajani, I., Zhang, T., Dubois, Y., Li, X., Guestrin, C., Liang, P., & Hashimoto, T. B. (2023). Stanford Alpaca: An Instruction-following LLaMA model. GitHub."},{"key":"2440_CR63","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et al. (2023). Llama: Open and efficient foundation language models. 
arXiv preprint arXiv:2302.13971"},{"key":"2440_CR64","doi-asserted-by":"crossref","unstructured":"Tsai, Y.-H. H., Bai, S., Yamada, M., Morency, L.-P., & Salakhutdinov, R. (2019). Transformer dissection: a unified understanding of transformer\u2019s attention via the lens of kernel. arXiv preprint arXiv:1908.11775","DOI":"10.18653\/v1\/D19-1443"},{"key":"2440_CR65","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. Advances in Neural Information Processing Systems, 30."},{"key":"2440_CR66","unstructured":"Wadekar, S. N., Chaurasia, A., Chadha, A., & Culurciello, E. (2024). The evolution of multimodal model architectures. arXiv preprint arXiv:2405.17927"},{"key":"2440_CR67","unstructured":"Wang, W., Chen, Z., Chen, X., Wu, J., Zhu, X., Zeng, G., Luo, P., Lu, T., Zhou, J., Qiao, Y., et al. (2024). Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. Advances in Neural Information Processing Systems, 36."},{"key":"2440_CR68","unstructured":"Wang, W., Lv, Q., Yu, W., Hong, W., Qi, J., Wang, Y., Ji, J., Yang, Z., Zhao, L., Song, X., et al. (2023). Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079"},{"key":"2440_CR69","unstructured":"Xu, R., Yao, Y., Guo, Z., Cui, J., Ni, Z., Ge, C., Chua, T.-S., Liu, Z., Sun, M., & Huang, G. (2024). Llava-uhd: An lmm perceiving any aspect ratio and high-resolution images. arXiv preprint arXiv:2403.11703"},{"key":"2440_CR70","unstructured":"Ye, Q., Xu, H., Xu, G., Ye, J., Yan, M., Zhou, Y., Wang, J., Hu, A., Shi, P., Shi, Y., et al. (2023). mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178"},{"key":"2440_CR71","doi-asserted-by":"crossref","unstructured":"Ye, Q., Xu, H., Ye, J., Yan, M., Hu, A., Liu, H., Qian, Q., Zhang, J., & Huang, F. (2024). mplug-owl2: Revolutionizing multi-modal large language model with modality collaboration. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 13040\u201313051).","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"2440_CR72","unstructured":"Yuan, L., Chen, D., Chen, Y.-L., Codella, N., Dai, X., Gao, J., Hu, H., Huang, X., Li, B., Li, C., et al. (2021). Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432"},{"key":"2440_CR74","doi-asserted-by":"crossref","unstructured":"Yue, X., Ni, Y., Zhang, K., Zheng, T., Liu, R., Zhang, G., Stevens, S., Jiang, D., Ren, W., Sun, Y., et al. (2024). Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 9556\u20139567).","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"2440_CR75","unstructured":"Zaken, E. B., Ravfogel, S., & Goldberg, Y. (2021). Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199"},{"key":"2440_CR76","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Fan, H., & Yang, Y. (2024). Prompt-aware adapter: Towards learning adaptive visual tokens for multimodal large language models. 
arXiv preprint arXiv:2405.15684","DOI":"10.1109\/TAI.2025.3596925"},{"key":"2440_CR77","unstructured":"Zhang, R., Han, J., Liu, C., Gao, P., Zhou, A., Hu, X., Yan, S., Lu, P., Li, H., & Qiao, Y. (2023). Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199"},{"key":"2440_CR78","doi-asserted-by":"crossref","unstructured":"Zhang, H., Shao, W., Liu, H., Ma, Y., Luo, P., Qiao, Y., & Zhang, K. (2024). Avibench: Towards evaluating the robustness of large vision-language model on adversarial visual-instructions. arXiv preprint arXiv:2403.09346","DOI":"10.1109\/TIFS.2024.3520306"},{"key":"2440_CR79","doi-asserted-by":"crossref","unstructured":"Zhang, H., Xu, L., Lai, S., Shao, W., Zheng, N., Luo, P., Qiao, Y., & Zhang, K. (2024). Open-vocabulary animal keypoint detection with semantic-feature matching. International Journal of Computer Vision 1\u201318.","DOI":"10.1007\/s11263-024-02126-3"},{"key":"2440_CR80","unstructured":"Zhang, Z., Zhang, A., Li, M., Zhao, H., Karypis, G., & Smola, A. (2023). Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923"},{"key":"2440_CR81","unstructured":"Zhao, B., Tu, H., Wei, C., Mei, J., & Xie, C. (2023). Tuning layernorm in attention: Towards efficient multi-modal llm finetuning. arXiv preprint arXiv:2312.11420"},{"key":"2440_CR82","doi-asserted-by":"crossref","unstructured":"Zhu, L., & Yang, Y. (2020). Actbert: Learning global-local video-text representations. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 8746\u20138755).","DOI":"10.1109\/CVPR42600.2020.00877"},{"key":"2440_CR83","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., & Elhoseiny, M. (2023). Minigpt-4: Enhancing vision-language understanding with advanced large language models. 
arXiv preprint arXiv:2304.10592"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02440-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02440-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02440-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T12:44:24Z","timestamp":1757162664000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02440-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,3]]},"references-count":82,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["2440"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02440-4","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,5,3]]},"assertion":[{"value":"30 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no conflicts of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"All authors have reviewed and approved the manuscript for publication.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"Not applicable.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Materials availability"}}]}}