{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T15:59:43Z","timestamp":1776095983683,"version":"3.50.1"},"reference-count":283,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62125206"],"award-info":[{"award-number":["62125206"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key Research and Development Program of China","award":["2022YFB4500100"],"award-info":[{"award-number":["2022YFB4500100"]}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LD24F020014"],"award-info":[{"award-number":["LD24F020014"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1109\/tpami.2025.3576835","type":"journal-article","created":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T13:41:51Z","timestamp":1749217311000},"page":"8415-8434","source":"Crossref","is-referenced-by-count":3,"title":["The Synergy Between Data and Multi-Modal Large Language Models: A Survey From Co-Development Perspective"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1756-6102","authenticated-orcid":false,"given":"Zhen","family":"Qin","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8015-2121","authenticated-orcid":false,"given":"Daoyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Wenhao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3828-796X","authenticated-orcid":false,"given":"Liuyi","family":"Yao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Yilun","family":"Huang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1535-9692","authenticated-orcid":false,"given":"Bolin","family":"Ding","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4204-6096","authenticated-orcid":false,"given":"Yaliang","family":"Li","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5015-6095","authenticated-orcid":false,"given":"Shuiguang","family":"Deng","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Reid","year":"2024"},{"key":"ref2","article-title":"SoRA: Creating video from text","year":"2024"},{"key":"ref3","article-title":"Hello GPT-4o","year":"2024"},{"key":"ref4","first-page":"53366","article-title":"NExT-GPT: Any-to-any multimodal LLM","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wu"},{"key":"ref5","article-title":"Emu3: Next-token prediction is all you need","author":"Wang","year":"2024"},{"key":"ref6","article-title":"A survey of large language models","author":"Zhao","year":"2023"},{"key":"ref7","article-title":"Will we run out of data? An analysis of the limits of scaling datasets in machine learning","author":"Villalobos","year":"2022"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02142"},{"key":"ref9","article-title":"Lumina-T2X: Transforming text into any modality, resolution, and duration via flow-based large diffusion transformers","author":"Gao","year":"2024"},{"key":"ref10","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"key":"ref11","first-page":"265","article-title":"Scaling laws for generative mixed-modal language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Aghajanyan"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3711118"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TAI.2023.3315272"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/s12599-024-00857-8"},{"key":"ref15","first-page":"27092","article-title":"DataComp: In search of the next generation of multimodal datasets","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Gadre"},{"key":"ref16","first-page":"35544","article-title":"Improving CLIP training with language rewrites","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Fan"},{"key":"ref17","article-title":"Efficient multimodal learning from data-centric perspective","author":"He","year":"2024"},{"key":"ref18","article-title":"A review of multi-modal large language and vision models","author":"Carolan","year":"2024"},{"key":"ref19","article-title":"The (R)evolution of multimodal large language models: A survey","author":"Caffagni","year":"2024"},{"key":"ref20","article-title":"A survey of resource-efficient LLM and multimodal foundation models","author":"Xu","year":"2024"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1561\/0600000110"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.738"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671473"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2025.3566695"},{"key":"ref26","article-title":"Efficient multimodal large language models: A survey","author":"Jin","year":"2024"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02572-7"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/BigData59044.2023.10386743"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3649447"},{"key":"ref32","article-title":"Exploring the reasoning abilities of multimodal large language models (MLLMS): A comprehensive survey on emerging trends in multimodal reasoning","author":"Wang","year":"2024"},{"key":"ref33","doi-asserted-by":"crossref","DOI":"10.36227\/techrxiv.171172801.19993069\/v1","article-title":"A survey on generative AI and LLM for video generation, understanding, and streaming","author":"Zhou","year":"2024"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671462"},{"key":"ref35","article-title":"A survey of multimodal large language model from a data-centric perspective","author":"Bai","year":"2024"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01393"},{"key":"ref38","article-title":"Allava: Harnessing gpt4v-synthesized data for a lite vision-language model","author":"Chen","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref40","article-title":"Self-alignment with instruction backtranslation","author":"Li","year":"2023"},{"key":"ref41","article-title":"Data management for large language models: A survey","author":"Wang","year":"2023"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.658"},{"key":"ref43","article-title":"A survey on data selection for language models","author":"Albalak","year":"2024"},{"key":"ref44","article-title":"A survey on data selection for LLM instruction tuning","author":"Wang","year":"2024"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.97"},{"key":"ref46","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref47","article-title":"An image is worth 16 \u00d7 16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref48","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref49","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Liu"},{"key":"ref50","article-title":"LLaMA-adapter V2: Parameter-efficient visual instruction model","author":"Gao","year":"2023"},{"key":"ref51","article-title":"Training compute-optimal large language models","author":"Hoffmann","year":"2022"},{"key":"ref52","article-title":"No \u201czero-sho","author":"Udandarao","year":"2024"},{"key":"ref53","article-title":"ChartThinker: A contextual chain-of-thought approach to optimized chart summarization","author":"Liu","year":"2024"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676408"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30383"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-1136"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-70533-5_26"},{"key":"ref60","article-title":"List items one by one: A new data source and learning paradigm for multimodal LLMs","author":"Yan","year":"2024"},{"key":"ref61","article-title":"TextSquare: Scaling up text-centric visual instruction tuning","author":"Tang","year":"2024"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.20"},{"key":"ref63","article-title":"TextHawk: Exploring efficient fine-grained perception of multimodal large language models","author":"Yu","year":"2024"},{"key":"ref64","article-title":"BuboGPT: Enabling visual grounding in multi-modal LLMs","author":"Zhao","year":"2023"},{"key":"ref65","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Lu"},{"key":"ref66","article-title":"Hunyuan-DiT: A powerful multi-resolution diffusion transformer with fine-grained chinese understanding","author":"Li","year":"2024"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref68","article-title":"Structchart: Perception, structuring, reasoning for visual chart understanding","author":"Xia","year":"2023"},{"key":"ref69","article-title":"Deep learning and LLM-based methods applied to stellar lightcurve classification","author":"Li","year":"2024"},{"key":"ref70","article-title":"What makes for good visual instructions? Synthesizing complex visual reasoning instructions for visual instruction tuning","author":"Du","year":"2023"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW63481.2024.10645462"},{"key":"ref72","article-title":"Videochat: Chat-centric video understanding","author":"Li","year":"2023"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681167"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.521"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.855"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680649"},{"key":"ref77","article-title":"Multimodal large language model is a human-aligned annotator for text-to-image generation","author":"Wu","year":"2024"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.70"},{"key":"ref79","first-page":"25081","article-title":"EmbodiedGPT: Vision-language pre-training via embodied chain of thought","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Mu"},{"key":"ref80","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhu"},{"key":"ref81","article-title":"MiniCPM-V: A GPT-4V level MLLM on your phone","author":"Yao","year":"2024"},{"key":"ref82","article-title":"Descriptive caption enhancement with visual specialists for multimodal perception","author":"Sun","year":"2024"},{"key":"ref83","article-title":"Inst-IT: Boosting multimodal instance understanding via explicit visual prompt instruction tuning","author":"Peng","year":"2024"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72775-7_2"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.743"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.265"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2025.3608656"},{"key":"ref88","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73337-6_8"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01239"},{"key":"ref91","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref92","article-title":"mPLUG-DocOwl: Modularized multimodal large language model for document understanding","author":"Ye","year":"2023"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29727"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-92089-9_23"},{"key":"ref95","article-title":"Data augmentation for text-based person retrieval using large language models","author":"Li","year":"2024"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/FG59268.2024.10581994"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01330"},{"key":"ref98","first-page":"71995","article-title":"GPT4Tools: Teaching large language model to use tools via self-instruction","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680808"},{"key":"ref100","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Alayrac"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.1009"},{"key":"ref102","article-title":"SPHINX-X: Scaling data and parameters for a family of multi-modal large language models","author":"Gao","year":"2024"},{"key":"ref103","first-page":"87310","article-title":"Cambrian-1: A fully open, vision-centric exploration of multimodal LLMs","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Tong"},{"key":"ref104","first-page":"8469","article-title":"PaLM-E: An embodied multimodal language model","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Driess"},{"key":"ref105","article-title":"Retrieval-augmented multi-modal chain-of-thoughts reasoning for large language models","author":"Liu","year":"2023"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1145\/3626246.3653385"},{"key":"ref107","first-page":"55006","article-title":"LIMA: Less is more for alignment","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhou"},{"key":"ref108","article-title":"Towards a statistical theory of data selection under weak supervision","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kolossov"},{"key":"ref109","first-page":"19523","article-title":"Beyond neural scaling laws: Beating power law scaling via data pruning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Sorscher"},{"key":"ref110","article-title":"On the de-duplication of LAION-2B","author":"Webster","year":"2023"},{"key":"ref111","article-title":"A survey on locality sensitive hashing algorithms and their applications","author":"Jafari","year":"2021"},{"key":"ref112","article-title":"Clip retrieval: Easily compute clip embeddings and build a clip retrieval system with them","author":"Beaumont","year":"2022"},{"key":"ref113","article-title":"Lossy image compression with compressive autoencoders","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Theis"},{"key":"ref114","article-title":"Multimodal data curation via object detection and filter ensembles","author":"Huang","year":"2024"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00673"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02116"},{"key":"ref117","article-title":"T-MARS: Improving visual representations by circumventing text feature learning","author":"Maini","year":"2023"},{"key":"ref118","first-page":"22047","article-title":"Improving multimodal datasets with image captioning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Nguyen"},{"key":"ref119","article-title":"The devil is in the details: A deep dive into the rabbit hole of data filtering","author":"Yu","year":"2023"},{"key":"ref120","article-title":"Finetuned multimodal language models are high-quality image-text data filters","author":"Wang","year":"2024"},{"key":"ref121","article-title":"FM2DS: Few-shot multimodal multihop data synthesis with knowledge distillation for question answering","author":"Abaskohi","year":"2024"},{"key":"ref122","article-title":"Data filtering networks","author":"Fang","year":"2023"},{"key":"ref123","article-title":"InstructionGPT-4: A 200-instruction paradigm for fine-tuning miniGPT-4","author":"Wei","year":"2023"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.5040\/9798881817916.ch-004"},{"key":"ref125","article-title":"Text-centric alignment for multi-modality learning","author":"Tsai","year":"2024"},{"key":"ref126","first-page":"21455","article-title":"Quality not quantity: On the interaction between dataset design and robustness of clip","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Nguyen"},{"key":"ref127","article-title":"A decade\u2019s battle on dataset bias: Are we there yet?","author":"Liu","year":"2024"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02489"},{"key":"ref129","first-page":"2252","article-title":"Patch n\u2019pack: NaViT, a vision transformer for any aspect ratio and resolution","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Dehghani"},{"key":"ref130","article-title":"Sora: A review on background, technology, limitations, and opportunities of large vision models","author":"Liu","year":"2024"},{"key":"ref131","article-title":"Fewer truncations improve language modeling","author":"Ding","year":"2024"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34706"},{"key":"ref133","article-title":"Introducing our multimodal models","author":"Bavishi","year":"2023"},{"key":"ref134","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023"},{"key":"ref135","article-title":"AlignGPT: Multi-modal large language models with adaptive alignment capability","author":"Zhao","year":"2024"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00726"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02526"},{"key":"ref139","first-page":"52233","article-title":"Freebind: Free lunch in unified multimodal space via knowledge fusion","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang"},{"key":"ref140","article-title":"LanguageBind: Extending video-language pretraining to N-modality by language-based semantic alignment","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhu"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02488"},{"key":"ref142","article-title":"$\\mathbb {D}^{2}$D2 pruning: Message passing for balancing diversity & difficulty in data pruning","author":"Maharana","year":"2024","journal-title":"ICLR"},{"key":"ref143","article-title":"Data mixing made efficient: A bivariate scaling law for language model pretraining","author":"Ge","year":"2024"},{"key":"ref144","article-title":"Set-of-mark prompting unleashes extraordinary visual grounding in GPT-4V","author":"Yang","year":"2023"},{"key":"ref145","article-title":"Draw-and-understand: Leveraging visual prompts to enable MLLMs to comprehend what you want","author":"Lin","year":"2024"},{"key":"ref146","article-title":"Shikra: Unleashing multimodal LLM\u2019s referential dialogue magic","author":"Chen","year":"2023"},{"key":"ref147","article-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023"},{"key":"ref148","article-title":"Time-LLM: Time series forecasting by reprogramming large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Jin"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00705"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.360"},{"key":"ref152","article-title":"MMICL: Empowering vision-language model with multi-modal in-context learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhao"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-93806-1_19"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i3.32316"},{"key":"ref155","article-title":"All in an aggregated image for in-image learning","author":"Wang","year":"2024"},{"key":"ref156","article-title":"Hallucination of multimodal large language models: A survey","author":"Bai","year":"2024"},{"key":"ref157","first-page":"26650","article-title":"Lamm: Language-assisted multi-modal instruction-tuning dataset, framework, and benchmark","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yin"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.256"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01350"},{"key":"ref160","article-title":"Silkie: Preference distillation for large visual language models","author":"Li","year":"2023","journal-title":"arXiv:2405.2312.10665"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01310"},{"key":"ref162","article-title":"Automated multi-level preference for MLLMs","author":"Zhang","year":"2024"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.775"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01274"},{"key":"ref165","article-title":"Mitigating hallucination in large multi-modal models via robust instruction tuning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Liu"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73232-4_3"},{"key":"ref167","article-title":"MedThink: Explaining medical visual question answering via multimodal decision-making rationale","author":"Gai","year":"2024"},{"key":"ref168","first-page":"5168","article-title":"DDCoT: Duty-distinct chain-of-thought prompting for multimodal reasoning in language models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zheng"},{"key":"ref169","article-title":"Language-image models with 3D understanding","author":"Cho","year":"2024"},{"key":"ref170","article-title":"Grounding-prompter: Prompting LLM with multimodal information for temporal sentence grounding in long videos","author":"Chen","year":"2023"},{"key":"ref171","article-title":"MM-react: Prompting chatGPT for multimodal reasoning and action","author":"Yang","year":"2023"},{"key":"ref172","first-page":"43447","article-title":"Chameleon: Plug-and-play compositional reasoning with large language models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Lu"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/SMC54092.2024.10831129"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.895"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_23"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2025.3581811"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00016"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00395"},{"key":"ref180","article-title":"SA-attack: Improving adversarial transferability of vision-language pre-training models via self-augmentation","author":"He","year":"2023"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.360"},{"key":"ref182","article-title":"(Ab) using images and sounds for indirect instruction injection in multi-modal LLMs","author":"Bagdasaryan","year":"2023"},{"key":"ref183","article-title":"The wolf within: Covert injection of malice into MLLM societies via an MLLM operative","author":"Tan","year":"2024"},{"key":"ref184","article-title":"Jailbreaking GPT-4V via self-adversarial attacks with system prompts","author":"Wu","year":"2023"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02368-9"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72992-8_22"},{"key":"ref187","article-title":"Med-MMHL: A multi-modal dataset for detecting human-and LLM-generated misinformation in the medical domain","author":"Sun","year":"2023"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/SP46215.2023.10179300"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.198"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1145\/3589132.3625611"},{"key":"ref191","article-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3362821"},{"key":"ref193","article-title":"Safeguarding data in multimodal AI: A differentially private approach to clip training","author":"Huang","year":"2023"},{"key":"ref194","first-page":"1273","article-title":"Communication-efficient learning of deep networks from decentralized data","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"McMahan"},{"key":"ref195","first-page":"41473","article-title":"Federated full-parameter tuning of billion-sized language models with communication cost under 18 kilobytes","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Qin"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671865"},{"key":"ref197","article-title":"Federated fine-tuning of large language models under heterogeneous language tasks and client resources","author":"Bai","year":"2024"},{"key":"ref198","first-page":"55320","article-title":"Ethical considerations for responsible data curation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Andrews"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2020.3011082"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1145\/1858996.1859088"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645520"},{"key":"ref202","article-title":"Watermarking vision-language pre-trained models for multi-modal embedding as a service","author":"Tang","year":"2023"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73337-6_9"},{"key":"ref204","article-title":"UNIAA: A unified multi-modal image aesthetic assessment baseline and benchmark","author":"Zhou","year":"2024"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00673"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/189"},{"key":"ref207","article-title":"M3DBench: Let\u2019s instruct large models with multi-modal 3D prompts","author":"Li","year":"2023"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02095"},{"key":"ref209","article-title":"Seed-Bench: Benchmarking multimodal LLMs with generative comprehension","author":"Li","year":"2023"},{"key":"ref210","article-title":"OpenLEAF: Open-domain interleaved image-text generation and evaluation","author":"An","year":"2023"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02060"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02090"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681488"},{"key":"ref214","article-title":"MLLM-as-a-judge: Assessing multimodal LLM-as-a-judge with vision-language benchmark","author":"Chen","year":"2024"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.573"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.178"},{"key":"ref217","article-title":"Jailbreaking attack against multimodal large language model","author":"Niu","year":"2024"},{"key":"ref218","article-title":"MMT-bench: A comprehensive multimodal benchmark for evaluating large vision-language models towards multitask AGI","author":"Ying","year":"2024"},{"key":"ref219","article-title":"ChartX & chartVLM: A versatile benchmark and foundation model for complicated chart reasoning","author":"Xia","year":"2024"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02064"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.446"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00161"},{"key":"ref224","article-title":"Sight beyond text: Multi-modal training enhances LLMs in truthfulness and ethics","author":"Tu","year":"2023"},{"key":"ref225","article-title":"Data efficient evaluation of large language models and text-to-image models via adaptive sampling","author":"Xu","year":"2024"},{"key":"ref226","article-title":"ChartLlama: A multimodal LLM for chart understanding and generation","author":"Han","year":"2023"},{"key":"ref227","article-title":"CompoDiff: Versatile composed image retrieval with latent diffusion","author":"Gu","year":"2023"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28334"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28081"},{"key":"ref231","article-title":"Aligned with LLM: A new multi-modal training paradigm for encoding fMRI activity in visual cortex","author":"Ma","year":"2024"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.85"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1145\/3640543.3645174"},{"key":"ref234","article-title":"Model-in-the-loop (MILO): Accelerating multimodal AI data annotation with LLMs","author":"Wang","year":"2024"},{"key":"ref235","article-title":"Data-juicer sandbox: A feedback-driven suite for multimodal data-model co-development","author":"Chen","year":"2024"},{"key":"ref236","article-title":"ZooProbe: A data engine for evaluating, exploring, and evolving large-scale training data for multimodal LLMs","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhang"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72904-1_9"},{"key":"ref238","article-title":"Fakeshield: Explainable image forgery detection and localization via multi-modal large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Xu"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1145\/3748304"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/aixmm62960.2025.00008"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.14778\/3611479.3611527"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29837"},{"key":"ref243","article-title":"Are LLMs ready for real-world materials discovery?","author":"Miret","year":"2024"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.305"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00188"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01284"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2025.103124"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.579"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.899"},{"issue":"1","key":"ref250","first-page":"176","article-title":"Robustness of structured data extraction from in-plane rotated documents using multi-modal large language models (LLM)","volume":"4","author":"Biswas","year":"2024","journal-title":"J. Artif. Intell. Res."},{"key":"ref251","article-title":"Structured entity extraction using large language models","author":"Wu","year":"2024"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.76"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.548"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.3390\/app132212208"},{"key":"ref255","article-title":"ChatGPT as your personal data scientist","author":"Hassan","year":"2023"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.637"},{"key":"ref257","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.175"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73414-4_14"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681294"},{"key":"ref260","article-title":"Bridging research and readers: A multi-modal automated academic papers interpretation system","author":"Jiang","year":"2024"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.125"},{"key":"ref262","article-title":"Daco: Towards application-driven and comprehensive data analysis via code generation","author":"Wu","year":"2024"},{"key":"ref263","article-title":"PosterLLaVA: Constructing a unified multi-modal layout generator with LLM","author":"Yang","year":"2024"},{"key":"ref264","article-title":"Data-copilot: Bridging billions of data and humans with autonomous workflow","author":"Zhang","year":"2023"},{"key":"ref265","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-demo.11"},{"key":"ref266","doi-asserted-by":"publisher","DOI":"10.1109\/PacificVis60374.2024.00049"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1145\/3654992"},{"key":"ref268","article-title":"LLMs meet multimodal generation and editing: A survey","author":"He","year":"2024"},{"key":"ref269","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01265"},{"key":"ref270","article-title":"InternVid: A large-scale video-text dataset for multimodal understanding and generation","author":"Wang","year":"2023"},{"key":"ref271","article-title":"ShareGPT4video: Improving video understanding and generation with better captions","author":"Chen","year":"2024"},{"key":"ref272","article-title":"M $^{3}$3 it: A large-scale dataset towards multi-modal multilingual instruction tuning","author":"Li","year":"2023"},{"key":"ref273","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.463"},{"key":"ref274","article-title":"Youku-mPLUG: A 10 million large-scale chinese video-language dataset for pre-training and benchmarks","author":"Xu","year":"2023"},{"key":"ref275","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01428"},{"key":"ref276","article-title":"Multimodal C4: An open, billion-scale corpus of images interleaved with text","author":"Zhu","year":"2023"},{"key":"ref277","first-page":"42748","article-title":"Perception test: A diagnostic benchmark for multimodal video models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Patraucean"},{"key":"ref278","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_10"},{"key":"ref279","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00454"},{"key":"ref280","article-title":"Constitutional AI: Harmlessness from AI feedback","author":"Bai","year":"2022"},{"key":"ref281","first-page":"26874","article-title":"RLAIF vs. RLHF: Scaling reinforcement learning from human feedback with AI feedback","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Lee"},{"key":"ref282","article-title":"RLAIF-V: Aligning MLLMs through open-source ai feedback for super GPT-4V trustworthiness","author":"Yu","year":"2024"},{"key":"ref283","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01835"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11163533\/11027559.pdf?arnumber=11027559","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,18]],"date-time":"2025-09-18T20:04:50Z","timestamp":1758225890000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11027559\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":283,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3576835","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10]]}}}