{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,8]],"date-time":"2026-03-08T01:53:39Z","timestamp":1772934819827,"version":"3.50.1"},"reference-count":25,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T00:00:00Z","timestamp":1765152000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T00:00:00Z","timestamp":1765152000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000161","name":"NIST","doi-asserted-by":"publisher","award":["60NANB24D143"],"award-info":[{"award-number":["60NANB24D143"]}],"id":[{"id":"10.13039\/100000161","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006498","name":"Clemson University, Clemson, South Carolina, USA","doi-asserted-by":"publisher","award":["69A3552344812"],"award-info":[{"award-number":["69A3552344812"]}],"id":[{"id":"10.13039\/100006498","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100010548","name":"National Center for Supercomputing Applications","doi-asserted-by":"publisher","award":["CIS240609"],"award-info":[{"award-number":["CIS240609"]}],"id":[{"id":"10.13039\/100010548","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2138259,2138286,2138307,2137603,2138296"],"award-info":[{"award-number":["2138259,2138286,2138307,2137603,2138296"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,8]]},"DOI":"10.1109\/bigdata66926.2025.11402394","type":"proceedings-article","created":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T20:57:57Z","timestamp":1772830677000},"page":"4544-4551","source":"Crossref","is-referenced-by-count":0,"title":["Vision Token Reduction via Attention-Driven Self-Compression for Efficient Multimodal Large Language Models"],"prefix":"10.1109","author":[{"given":"Omer Faruk","family":"Deniz","sequence":"first","affiliation":[{"name":"The University of Texas at Dallas,Richardson,TX"}]},{"given":"Ruiyu","family":"Mao","sequence":"additional","affiliation":[{"name":"The University of Texas at Dallas,Richardson,TX"}]},{"given":"Ruochen","family":"Li","sequence":"additional","affiliation":[{"name":"The University of Texas at Dallas,Richardson,TX"}]},{"given":"Yapeng","family":"Tian","sequence":"additional","affiliation":[{"name":"The University of Texas at Dallas,Richardson,TX"}]},{"given":"Latifur","family":"Khan","sequence":"additional","affiliation":[{"name":"The University of Texas at Dallas,Richardson,TX"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref2","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"ref4","article-title":"Token merging for multimodal large language models","author":"Ge","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref5","article-title":"Sparsevlm: Visual token sparsification for efficient vision-language model inference","volume-title":"International Conference on Machine Learning","author":"Zhang","year":"2025"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref7","first-page":"19730","article-title":"Blip-2: Bootstrapping languageimage pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning. PMLR","author":"Li","year":"2023"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32229"},{"key":"ref9","article-title":"Prunevid: Visual token pruning for efficient video large language models","author":"Huang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref10","article-title":"Multi-stage vision token dropping: Towards efficient multimodal large language model","author":"Liu","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1189"},{"key":"ref12","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref14","article-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023","journal-title":"arXiv"},{"key":"ref15","article-title":"Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale","author":"Dettmers","year":"2022","journal-title":"NeurIPS"},{"key":"ref16","first-page":"21702","article-title":"Llm-pruner: On the structural pruning of large language models","volume":"36","author":"Ma","year":"2023","journal-title":"NeurIPS"},{"key":"ref17","article-title":"Minillm: Knowledge distillation of large language models","author":"Gu","year":"2023","journal-title":"arXiv"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3688863.3689575"},{"key":"ref19","article-title":"Dynamicvit: Efficient vision transformers with dynamic token sparsification","author":"Rao","year":"2021","journal-title":"NeurIPS"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20083-0_37"},{"key":"ref21","article-title":"Token merging: Your vit but faster","author":"Bolya","year":"2022","journal-title":"arXiv"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_5"},{"key":"ref23","article-title":"Llava-prumerge: Adaptive token reduction for efficient large multimodal models","author":"Shang","year":"2024","journal-title":"arXiv"},{"key":"ref24","article-title":"Plphp: Per-layer per-head vision token pruning for efficient large visionlanguage models","author":"Meng","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref25","article-title":"Pyramiddrop: Accelerating your large vision-language models via pyramid visual redundancy reduction","author":"Xing","year":"2024","journal-title":"arXiv preprint arXiv"}],"event":{"name":"2025 IEEE International Conference on Big Data (BigData)","location":"Macau, China","start":{"date-parts":[[2025,12,8]]},"end":{"date-parts":[[2025,12,11]]}},"container-title":["2025 IEEE International Conference on Big Data (BigData)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11400704\/11400712\/11402394.pdf?arnumber=11402394","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T07:17:50Z","timestamp":1772867870000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11402394\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,8]]},"references-count":25,"URL":"https:\/\/doi.org\/10.1109\/bigdata66926.2025.11402394","relation":{},"subject":[],"published":{"date-parts":[[2025,12,8]]}}}