{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T17:55:51Z","timestamp":1775325351382,"version":"3.50.1"},"reference-count":108,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62422204"],"award-info":[{"award-number":["62422204"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62125201"],"award-info":[{"award-number":["62125201"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072147"],"award-info":[{"award-number":["62072147"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62020106007"],"award-info":[{"award-number":["62020106007"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372147"],"award-info":[{"award-number":["62372147"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62206082"],"award-info":[{"award-number":["62206082"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2040"],"award-info":[{"award-number":["U21B2040"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LR22F020001"],"award-info":[{"award-number":["LR22F020001"]}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LDT23F02025F02"],"award-info":[{"award-number":["LDT23F02025F02"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/tmm.2025.3557680","type":"journal-article","created":{"date-parts":[[2025,4,11]],"date-time":"2025-04-11T18:04:23Z","timestamp":1744394663000},"page":"2961-2974","source":"Crossref","is-referenced-by-count":7,"title":["Imp: Highly Capable Large Multimodal Models for Mobile Devices"],"prefix":"10.1109","volume":"27","author":[{"given":"Zhenwei","family":"Shao","sequence":"first","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8407-1137","authenticated-orcid":false,"given":"Zhou","family":"Yu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1922-7283","authenticated-orcid":false,"given":"Jun","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Intelligence Science and Engineering, Harbin Institute of Technology Shenzhen, Shenzhen, China"}]},{"given":"Xuecheng","family":"Ouyang","sequence":"additional","affiliation":[{"name":"HDU-ITMO Joint Institute, Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0999-7030","authenticated-orcid":false,"given":"Lihao","family":"Zheng","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"given":"Zhenbiao","family":"Gai","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"given":"Mingyang","family":"Wang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9813-7037","authenticated-orcid":false,"given":"Zhenzhong","family":"Kuang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7497-7485","authenticated-orcid":false,"given":"Jiajun","family":"Ding","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2018.2852750"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2023.3339628"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.025"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2021.3122291"},{"issue":"8","key":"ref8","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref9","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"issue":"240","key":"ref10","first-page":"1","article-title":"PaLM: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"J. Mach. Learn. Res."},{"key":"ref11","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.2307\/j.ctv1tjrvk1.47"},{"key":"ref13","article-title":"Emergent abilities of large language models","author":"Wei","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref14","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ouyang","year":"2022"},{"key":"ref15","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Rafailov","year":"2024"},{"key":"ref16","article-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"ref17","article-title":"Gemini: A family of highly capable multimodal models","year":"2023"},{"key":"ref18","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2021.3132068"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02224"},{"key":"ref21","article-title":"GPT-4V(ision) system card","year":"2023"},{"key":"ref22","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Liu","year":"2023"},{"key":"ref23","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Zhu","year":"2024"},{"key":"ref24","article-title":"Qwen-VL: A frontier large vision-language model with versatile abilities","author":"Bai","year":"2023"},{"key":"ref25","first-page":"121475","article-title":"COGVLM: Visual expert for pretrained language models","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","author":"Wang","year":"2024"},{"key":"ref26","article-title":"VideoChat: Chat-centric video understanding","author":"Li","year":"2023"},{"key":"ref27","first-page":"18090","article-title":"Pengi: An audio language model for audio tasks","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Deshmukh","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref30","article-title":"Macaw-LLM: Multi-modal language modeling with image, audio, video, and text integration","author":"Lyu","year":"2023"},{"key":"ref31","article-title":"LanguageBind: Extending video-language pretraining to n-modality by language-based semantic alignment","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Zhu","year":"2024"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01457"},{"key":"ref33","first-page":"11","article-title":"PandaGPT: One model to instruction-follow them all","volume-title":"Proc. 1st Workshop Taming Large Lang. Models: Controllability Era Interactive Assistants","author":"Su","year":"2023"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1631\/fitee.2400250"},{"key":"ref35","article-title":"Look before you leap: Unveiling the power of GPT-4V in robotic vision-language planning","volume-title":"Proc. 1st Workshop Vis.- Lang. Models Navigation Manipulation ICRA","author":"Hu","year":"2024"},{"key":"ref36","article-title":"Mobile-agent: Autonomous multi-modal mobile device agent with visual perception","volume-title":"Proc. Workshop Large Lang. Model Agents","author":"Wang","year":"2024"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref38","article-title":"LLaMA 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.670"},{"issue":"3","key":"ref41","article-title":"Phi-2: The surprising power of small language models","volume":"1","author":"Javaheripi","year":"2023","journal-title":"Microsoft Res. Blog"},{"key":"ref42","article-title":"MiniCPM: Unveiling the potential of small language models with scalable training strategies","volume-title":"Proc. 1st Conf. Lang. Model.","author":"Hu","year":"2024"},{"key":"ref43","first-page":"2024","article-title":"TinyGPT-V: Efficient multimodal large language model via small backbones","volume-title":"Proc. 2nd Workshop Advancing Neural Netw. Training: Comput. Efficiency, Scalability, Resour. Optim.","author":"Yuan"},{"key":"ref44","article-title":"Small language model meets with reinforced vision vocabulary","author":"Wei","year":"2024"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.599"},{"key":"ref46","article-title":"MiniCPM-V: A GPT-4V level MLLM on your phone","author":"Yao","year":"2024"},{"key":"ref47","article-title":"Efficient multimodal learning from data-centric perspective","author":"He","year":"2024"},{"key":"ref48","first-page":"23123","article-title":"Prismatic VLMs: Investigating the design space of visually-conditioned language models","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Karamcheti","year":"2024"},{"key":"ref49","article-title":"Qwen technical report","author":"Bai","year":"2023"},{"key":"ref50","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","author":"Abdin","year":"2024"},{"key":"ref51","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"issue":"1","key":"ref52","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref53","article-title":"ChatGPT: Optimizing language models for dialogue","year":"2023"},{"key":"ref54","article-title":"Baichuan 2: Open large-scale language models","author":"Yang","year":"2023"},{"key":"ref55","article-title":"Visual ChatGPT: Talking, drawing and editing with visual foundation models","author":"Wu","year":"2023"},{"key":"ref56","article-title":"MM-React: Prompting ChatGPT for multimodal reasoning and action","author":"Yang","year":"2023"},{"key":"ref57","first-page":"38154","article-title":"HuggingGPT: Solving AI tasks with ChatGPT and its friends in huggingface","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Shen","year":"2024"},{"key":"ref58","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Alayrac","year":"2022"},{"key":"ref59","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2023"},{"key":"ref60","first-page":"49250","article-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning","volume":"36","author":"Dai","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref61","article-title":"Shikra: Unleashing multimodal LLM\u2019s referential dialogue magic","author":"Chen","year":"2023"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02520"},{"key":"ref63","first-page":"60116","article-title":"Next-chat: An LMM for chat, detection and segmentation","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","volume":"235","author":"Zhang","year":"2024"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref65","first-page":"55976","article-title":"DoraemonGPT: Toward understanding dynamic scenes with large language models (exemplified as a video agent)","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","volume":"235","author":"Yang","year":"2024"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref67","article-title":"LLaVA-OneVision: Easy visual task transfer","author":"Li","year":"2025","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref68","article-title":"Efficient multimodal large language models: A survey","author":"Jin","year":"2024"},{"key":"ref69","article-title":"Textbooks are all you need","author":"Gunasekar","year":"2023"},{"key":"ref70","article-title":"Gemma: Open models based on gemini research and technology","author":"Gemma Team","year":"2024"},{"key":"ref71","article-title":"TinyLLaMA: An open-source small language model","author":"Zhang","year":"2024"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/3688863.3689575"},{"key":"ref73","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref74","article-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90 ChatGPT quality","author":"Chiang","year":"2023"},{"key":"ref75","article-title":"LORA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hu","year":"2022"},{"key":"ref76","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2019.00686"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref79","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Lu","year":"2022"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref83","article-title":"DeepSeek-Vl: Towards real-world vision-language understanding","author":"Lu","year":"2024"},{"key":"ref84","article-title":"LLaVA-next: Improved reasoning, OCR, and world knowledge","author":"Liu","year":"2024"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2018.00592"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/wacv48630.2021.00225"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"ref92","article-title":"LAION-GPT-V","year":"2023"},{"key":"ref93","article-title":"ALLaVA: Harnessing GPT4V-synthesized data for a lite vision-language model","author":"Chen","year":"2024"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2018.00380"},{"key":"ref95","article-title":"MME: A comprehensive evaluation benchmark for multimodal large language models","author":"Fu","year":"2023"},{"key":"ref96","article-title":"MM-Vet: Evaluating large multimodal models for integrated capabilities","author":"Yu","year":"2023"},{"key":"ref97","article-title":"Mini-Gemini: Mining the potential of multi-modality vision language models","author":"Li","year":"2024"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01239"},{"key":"ref99","first-page":"32400","article-title":"Sphinx-X: Scaling data and parameters for a family of multi-modal large language models","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","volume":"235","author":"Liu","year":"2024"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/bdcat63179.2024.00048"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00913"},{"key":"ref102","article-title":"MathVista: Evaluating mathematical reasoning of foundation models in visual contexts","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lu","year":"2024"},{"key":"ref103","article-title":"AMBER: An LLM-free multi-dimensional benchmark for MLLMs hallucination evaluation","author":"Wang","year":"2023"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01363"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.775"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02095"},{"key":"ref107","article-title":"Video-MME: The first-ever comprehensive evaluation benchmark of multi-modal LLMs in video analysis","author":"Fu","year":"2024"},{"key":"ref108","article-title":"Video instruction tuning with synthetic data","author":"Zhang","year":"2024"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6046\/10844992\/10962548.pdf?arnumber=10962548","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T17:30:52Z","timestamp":1748539852000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10962548\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":108,"URL":"https:\/\/doi.org\/10.1109\/tmm.2025.3557680","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}