{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T10:11:43Z","timestamp":1773137503302,"version":"3.50.1"},"reference-count":228,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Shenzhen Science and Technology Program","award":["JCYJ20241202130548062"],"award-info":[{"award-number":["JCYJ20241202130548062"]}]},{"name":"Natural Science Foundation of Shenzhen","award":["JCYJ20230807142703006"],"award-info":[{"award-number":["JCYJ20230807142703006"]}]},{"name":"Key Research Platforms, Projects of the Guangdong Provincial Department of Education","award":["2023ZDZX1034"],"award-info":[{"award-number":["2023ZDZX1034"]}]},{"name":"Opening Project of the State Key Laboratory of General Artificial Intelligence, BIGAI\/Peking University, Beijing, China","award":["SKLAGI2025OP03"],"award-info":[{"award-number":["SKLAGI2025OP03"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tmm.2025.3632653","type":"journal-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:50:18Z","timestamp":1763146218000},"page":"1188-1210","source":"Crossref","is-referenced-by-count":0,"title":["How Vision-Language Tasks Benefit From Large Pre-Trained Models: A Survey"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-0021-9475","authenticated-orcid":false,"given":"Yayun","family":"Qi","sequence":"first","affiliation":[{"name":"Beijing Key Laboratory of Intelligent Information Technology, School of Computer Science &amp; Technology, Beijing Institute of Technology, Beijing, China"}]},{"given":"Hongxi","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing Key Laboratory of Intelligent Information Technology, School of Computer Science &amp; Technology, Beijing Institute of Technology, Beijing, China"}]},{"given":"Yiqi","family":"Song","sequence":"additional","affiliation":[{"name":"Beijing Key Laboratory of Intelligent Information Technology, School of Computer Science &amp; Technology, Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2056-6947","authenticated-orcid":false,"given":"Xinxiao","family":"Wu","sequence":"additional","affiliation":[{"name":"Beijing Key Laboratory of Intelligent Information Technology, School of Computer Science &amp; Technology, Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4516-9729","authenticated-orcid":false,"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.07.009"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00090"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.501"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00359"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"ref9","first-page":"1","article-title":"Visual classification via description from large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Menon","year":"2022"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00287"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.351"},{"key":"ref12","first-page":"69706","article-title":"ChatGPT-powered hierarchical comparisons for image classification","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Ren","year":"2024"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1561\/0600000105"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298966"},{"key":"ref16","first-page":"1462","article-title":"Draw: A recurrent neural network for image generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Gregor","year":"2015"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00854"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00210"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00165"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6627"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2023.3238524"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00611"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"ref24","article-title":"Visual entailment: A novel task for fine-grained image understanding","author":"Xie","year":"2019"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_32"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00247"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01258"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"ref29","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref30","article-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90 ChatGPT quality","author":"Chiang","year":"2023"},{"key":"ref31","article-title":"Qwen technical report","author":"Bai","year":"2023"},{"key":"ref32","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref33","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia","year":"2021"},{"key":"ref34","first-page":"32897","article-title":"Vlmo: Unified vision-language pre-training with mixture-of-modality-experts","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Bao","year":"2022"},{"key":"ref35","first-page":"1","article-title":"Filip: Fine-grained interactive language-image pre-training","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yao","year":"2022"},{"key":"ref36","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2022"},{"key":"ref37","first-page":"1","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhu","year":"2024"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref40","article-title":"A survey of large language models","author":"Zhao","year":"2023"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3744746"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3641289"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1410-8"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3506283"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.738"},{"key":"ref47","article-title":"Exploring the reasoning abilities of multimodal large language models (MLLMs): A comprehensive survey on emerging trends in multimodal reasoning","author":"Wang","year":"2024"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2025.3566695"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"ref50","article-title":"LLMs meet multimodal generation and editing: A survey","author":"He","year":"2024"},{"key":"ref51","first-page":"1","article-title":"Generalized out-of-distribution detection and beyond in vision language model era: A survey","volume":"2025","author":"Miyai","year":"2025","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref53","article-title":"Ernie: Enhanced representation through knowledge integration","author":"Sun","year":"2019"},{"key":"ref54","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Brown","year":"2020"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1208"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2954741"},{"key":"ref57","first-page":"3063","article-title":"Weakly supervised dense event captioning in videos","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Duan","year":"2018"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.517"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00425"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00751"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"ref62","article-title":"Language models can see: Plugging visual controls in text generation","author":"Su","year":"2022"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02247"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01337"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612179"},{"key":"ref66","article-title":"Zero-shot video captioning with evolving pseudo-tokens","author":"Tewel","year":"2022"},{"key":"ref67","article-title":"Zero-shot dense video captioning by jointly optimizing text and moment","author":"Jo","year":"2023"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547935"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.67"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0617"},{"key":"ref73","article-title":"Language as the medium: Multimodal video classification through text only","author":"Hanu","year":"2023"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.608"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00035"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.299"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00252"},{"key":"ref78","first-page":"1","article-title":"Decap: Decoding clip latents for zero-shot captioning via text-only training","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Li","year":"2023"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/481"},{"key":"ref80","article-title":"Zero-shot image captioning by anchor-augmented vision-language space alignment","author":"Wang","year":"2022"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00291"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00257"},{"key":"ref83","article-title":"Clip-gen: Language-free training of a text-to-image generator with clip","author":"Wang","year":"2022"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28203"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28178"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0819"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01738"},{"key":"ref88","article-title":"See, think, confirm: Interactive prompting between vision and language models for knowledge-based visual reasoning","author":"Chen","year":"2023"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.640"},{"key":"ref90","article-title":"Domino: A dual-system for multi-step visual language reasoning","author":"Wang","year":"2023"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.755"},{"key":"ref92","article-title":"Good questions help zero-shot image reasoning","author":"Yang","year":"2023"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.255"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.189"},{"key":"ref95","article-title":"Multimodal multi-hop question answering through a conversation between tools and efficiently finetuned large language models","author":"Rajabzadeh","year":"2023"},{"key":"ref96","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu","year":"2022"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.734"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29884"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29844"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.11"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.465"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01367"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-78456-9_15"},{"key":"ref104","article-title":"Chain of images for intuitively reasoning","author":"Meng","year":"2023"},{"key":"ref105","article-title":"Visual chain of thought: Bridging logical gaps with multimodal infillings","author":"Rose","year":"2023"},{"key":"ref106","first-page":"204","article-title":"Lets think frame by frame: Evaluating video chain of thought with video infilling and prediction","volume-title":"Proc. Conf. Empirical Methods Nat. Lang. Process.","author":"Himakunthala","year":"2023"},{"key":"ref107","first-page":"50105","article-title":"Zero-shot visual relation detection via composite visual cues from large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Li","year":"2024"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680690"},{"key":"ref109","article-title":"Videoprompter: An ensemble of foundational models for zero-shot video understanding","author":"Yousaf","year":"2023"},{"key":"ref110","first-page":"1","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gu","year":"2022"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01076"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_16"},{"key":"ref114","first-page":"1","article-title":"Open-vocabulary object detection upon frozen vision and language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kuo","year":"2023"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548021"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3524181"},{"key":"ref117","first-page":"46491","article-title":"LMC: Large model collaboration with cross-assessment for training-free open-set object recognition","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Qu","year":"2024"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00276"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1844"},{"key":"ref120","article-title":"Beyond anti-forgetting: Multimodal continual instruction tuning with positive forward transfer","author":"Zheng","year":"2024"},{"key":"ref121","article-title":"Mm-react: Prompting ChatGPT for multimodal reasoning and action","author":"Yang","year":"2023"},{"key":"ref122","article-title":"Visual ChatGPT: Talking, drawing and editing with visual foundation models","author":"Wu","year":"2023"},{"key":"ref123","first-page":"43447","article-title":"Chameleon: Plug-and-play compositional reasoning with large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu","year":"2023"},{"key":"ref124","article-title":"Mm-vid: Advancing video understanding with GPT-4 v(ision)","author":"Lin","year":"2023"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72920-1_18"},{"key":"ref128","article-title":"Towards truly zero-shot compositional visual reasoning with LLMs as programmers","author":"Stanic","year":"2024"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2015.303"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73232-4_3"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01512"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_28"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21346"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240563"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1145\/3698590"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01747"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_28"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.500"},{"key":"ref144","article-title":"Incorporating external knowledge to answer open-domain visual questions with dynamic memory networks","author":"Li","year":"2017"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2754246"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-88361-4_9"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1145\/219717.219748"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1145\/2629489"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01074"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"ref152","first-page":"1","article-title":"Finetuned language models are zero-shot learners","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wei","year":"2022"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1145\/3777411"},{"key":"ref154","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ouyang","year":"2022"},{"key":"ref155","first-page":"4299","article-title":"Deep reinforcement learning from human preferences","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Christiano","year":"2017"},{"key":"ref156","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Li","year":"2021"},{"key":"ref157","first-page":"26070","article-title":"Modeling caption diversity in contrastive vision-language pretraining","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Lavoie","year":"2024"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"ref159","article-title":"Actionclip: A new paradigm for video action recognition","author":"Wang","year":"2021"},{"key":"ref160","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2023"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.775"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00854"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref167","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Liu","year":"2024"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3571946"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3571946"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00380"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"ref175","article-title":"Seed-bench: Benchmarking multimodal LLMs with generative comprehension","author":"Li","year":"2023"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00294"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-short.43"},{"key":"ref180","first-page":"46433","article-title":"Cola: A benchmark for compositional text-to-image retrieval","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Ray","year":"2024"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01050"},{"key":"ref182","first-page":"1","article-title":"When and why vision-language models behave like bags-of-words, and what to do about it?","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yuksekgonul","year":"2022"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1355"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01363"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.370"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01271"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02064"},{"issue":"8","key":"ref188","first-page":"1","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.681"},{"issue":"1","key":"ref190","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"ref192","article-title":"Clipcap: Clip prefix for image captioning","author":"Mokady","year":"2021"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/128"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28251"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref197","first-page":"65","article-title":"Meteor: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proc. Workshop Annu. Meeting Assoc. Comput. Linguistics","author":"Banerjee","year":"2005"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1800"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3361157"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref206","article-title":"The Caltech-UCSD birds-200-2011 dataset","author":"Wah","year":"2011","journal-title":"California Inst. Technol."},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"ref211","article-title":"An image is worth 16  16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref212","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Li","year":"2021"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_24"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref215","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref216","article-title":"Hallusionbench: You see what you think? Or you think what you see? An image-context reasoning benchmark challenging for GPT-4v (ision), llava-1.5, and other multi-modality models","author":"Liu","year":"2023"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.eacl-main.156"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1162\/coli.a.16"},{"key":"ref219","first-page":"59670","article-title":"How language model hallucinations can snowball","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhang","year":"2024"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1145\/3735633"},{"key":"ref221","first-page":"1","article-title":"Visrag: Vision-based retrieval-augmented generation on multi-modality documents","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yu","year":"2025"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.108"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00188"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.14"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.886"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_20"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.567"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6046\/11342315\/11248964.pdf?arnumber=11248964","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T19:59:27Z","timestamp":1773086367000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11248964\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":228,"URL":"https:\/\/doi.org\/10.1109\/tmm.2025.3632653","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}