{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:11:05Z","timestamp":1777889465859,"version":"3.51.4"},"reference-count":57,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62306303,62476265"],"award-info":[{"award-number":["62306303,62476265"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00038","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"329-339","source":"Crossref","is-referenced-by-count":1,"title":["ShortV: Efficient Multimodal Large Language Models by Freezing Visual Tokens in Ineffective Layers"],"prefix":"10.1109","author":[{"given":"Qianhao","family":"Yuan","sequence":"first","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanjiang","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of 
Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiawei","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yaojie","family":"Lu","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongyu","family":"Lin","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia","family":"Zheng","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xianpei","family":"Han","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Le","family":"Sun","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref3","article-title":"Openflamingo: An open-source framework for training large autoregressive vision-language models","author":"Awadalla","year":"2023","journal-title":"arXiv preprint"},{"key":"ref4","author":"Bavishi","year":"2023","journal-title":"Introducing our multimodal models"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01311"},{"key":"ref6","article-title":"Auroracap: Efficient, performant video detailed captioning and a new benchmark","author":"Chai","year":"2024","journal-title":"arXiv 
preprint"},{"key":"ref7","article-title":"Minigpt-v2: large language model as a unified interface for vision-language multi-task learning","author":"Chen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Evlm: An efficient vision-language model for visual understanding","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0850"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"ref11","article-title":"Streamlining redundant layers to compress large language models","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"issue":"3","key":"ref12","article-title":"Vicuna: An open-source chatbot impressing gpt-4 with $90 \\%^{*}$ chatgpt quality","volume":"2","author":"Chiang","year":"2023","journal-title":"See"},{"key":"ref13","article-title":"Nvlm: Open frontier-class multimodal llms","author":"Dai","year":"2024","journal-title":"arXiv preprint"},{"key":"ref14","article-title":"An image is worth $16 \\times 16$ words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv preprint"},{"key":"ref15","author":"Fu","year":"2024","journal-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref17","article-title":"What matters in transformers? 
not all attention is needed","author":"He","year":"2024","journal-title":"arXiv preprint"},{"key":"ref18","article-title":"Deciphering cross-modal alignment in large vision-language models with modality integration rate","author":"Huang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref20","volume-title":"Introducing idefics: An open reproduction of state-of-the-art visual language model","year":"2023"},{"key":"ref21","author":"Ilharco","year":"2021","journal-title":"Openclip"},{"key":"ref22","first-page":"4651","article-title":"Perceiver: General perception with iterative attention","volume-title":"International conference on machine learning","author":"Jaegle"},{"key":"ref23","article-title":"Shortened llama: Depth pruning for large language models with comparison of retraining methods","author":"Kim","year":"2024","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2789"},{"key":"ref25","article-title":"Seed-bench: Benchmarking multimodal llms with generative comprehension","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref26","article-title":"Tokenpacker: Efficient visual projector for multimodal llm","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"ref28","article-title":"Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models","author":"Lin","year":"2023","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32567"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref32","author":"Liu","year":"2024","journal-title":"Llava-next: Improved reasoning, ocr, and world 
knowledge"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref35","article-title":"Deepseek-vl: towards real-world vision-language understanding","author":"Lu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref36","volume-title":"Shortgpt: Layers in large language models are more redundant than you expect","author":"Men","year":"2024"},{"issue":"5","key":"ref37","article-title":"Introducing meta llama 3: The most capable openly available llm to date","volume":"2","author":"Meta","year":"2024","journal-title":"Meta AI"},{"key":"ref38","volume-title":"Gpt-4v(ision) system card","year":"2023"},{"key":"ref39","year":"2024","journal-title":"Introducing gpt-4o: our fastest and most affordable flagship model"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref41","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01969"},{"key":"ref43","article-title":"Llava-prumerge: Adaptive token reduction for efficient large multimodal models","author":"Shang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3604614"},{"key":"ref46","article-title":"Sleb: Streamlining llms through redundancy verification and elimination of transformer blocks","author":"Song","year":"2024","journal-title":"arXiv preprint"},{"key":"ref47","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv 
preprint"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3860"},{"key":"ref49","article-title":"Baichuan 2: Open large-scale language models","author":"Yang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.372"},{"key":"ref51","article-title":"mplug-owl3: Towards long image-sequence understanding in multimodal large language models","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Ye"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01239"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02777"},{"key":"ref54","article-title":"Saisa: Towards multimodal large language models with both training and inference efficiency","author":"Yuan","year":"2025","journal-title":"arXiv preprint"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref57","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision 
(ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445347.pdf?arnumber=11445347","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:13:10Z","timestamp":1777612390000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445347\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":57,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00038","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}