{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T02:40:51Z","timestamp":1781836851920,"version":"3.54.5"},"reference-count":76,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408525","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-18","source":"Crossref","is-referenced-by-count":1,"title":["Focus: A Streaming Concentration Architecture for Efficient Vision-Language Models"],"prefix":"10.1109","author":[{"given":"Chiyue","family":"Wei","sequence":"first","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Cong","family":"Guo","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Junyao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Haoxuan","family":"Shan","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yifan","family":"Xu","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ziyue","family":"Zhang","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yudong","family":"Liu","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qinsi","family":"Wang","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Changchun","family":"Zhou","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hai Helen","family":"Li","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yiran","family":"Chen","sequence":"additional","affiliation":[{"name":"Duke University"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Flamingo: a visual language model for few-shot learning","author":"Alayrac","year":"2022"},{"key":"ref2","article-title":"Amazon titan multimodal embeddings foundation model","author":"Web Services","year":"2023","journal-title":"Amazon Bedrock User Guide"},{"key":"ref3","article-title":"Qwen2. 5-vl technical report","author":"Bai","year":"2025","journal-title":"arXiv preprint"},{"key":"ref4","article-title":"Token merging: Your vit but faster","author":"Bolya","year":"2023","journal-title":"in ICLR"},{"key":"ref5","article-title":"An introduction to vision-language modeling","author":"Bordes","year":"2024","journal-title":"arXiv preprint"},{"key":"ref6","article-title":"Language models are few-shot learners","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00084"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731024"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"key":"ref12","author":"Corporation","year":"2023","journal-title":"Jetson orin nano developer kit"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/wacv61041.2025.00550"},{"key":"ref14","article-title":"8-bit optimizers via block-wise quantization","author":"Dettmers","year":"2021","journal-title":"arXiv preprint"},{"key":"ref15","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv preprint"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071047"},{"key":"ref17","article-title":"An image is worth $16 \\times 16$ words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750389"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02245"},{"key":"ref20","article-title":"Framefusion: Combining similarity and importance for video token reduction on large visual language models","author":"Fu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00020"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589038"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731043"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00095"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00086"},{"key":"ref28","article-title":"Dynamic-llava: Efficient multimodal large language models via dynamic vision-language context sparsification","author":"Huang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.1024"},{"issue":"2","key":"ref30","first-page":"3","volume":"1","author":"Intelligence","journal-title":"\u00b5 0.5: a vision-languageaction model with open-world generalization, 2025"},{"key":"ref31","article-title":"TPU v4: An optically reconfigurable supercomputer for machine learning with hardware support for embeddings","volume":"abs\/2304.01433","author":"Jouppi","year":"2023","journal-title":"CoRR"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref33","article-title":"Openvla: An opensource vision-language-action model","author":"Kim","year":"2024","journal-title":"arXiv preprint"},{"key":"ref34","first-page":"256","article-title":"Systolic arrays (for vlsi)","volume-title":"Sparse Matrix Proceedings 1978","volume":"1","author":"Kung"},{"key":"ref35","article-title":"Llava-onevision: Easy visual task transfer","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref36","article-title":"Blip: Bootstrapping languageimage pre-training for unified vision-language understanding and generation","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Li"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2020.2973991"},{"key":"ref39","article-title":"Twilight: Adaptive attention sparsity with hierarchical top- $p$ pruning","author":"Lin","year":"2025","journal-title":"arXiv preprint"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"ref41","article-title":"Visual instruction tuning","volume-title":"Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023","author":"Liu","year":"2023"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref43","first-page":"20802","article-title":"Keyframe-oriented vision token pruning: Enhancing efficiency of large vision language models on long-form video processing","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Liu"},{"key":"ref44","volume-title":"Nvila: Efficient frontier visual language models","author":"Liu","year":"2025"},{"key":"ref45","article-title":"Vision language model-based caption evaluation method leveraging visual context extraction","volume":"abs\/2402.17969","author":"Maeda","year":"2024","journal-title":"C o R R"},{"key":"ref46","article-title":"Nvidia a100 tensor core gpu architecture","year":"2020","journal-title":"NVIDIA Corporation, White Paper"},{"key":"ref47","volume-title":"Gpt-4 technical report","author":"Achiam","year":"2024"},{"key":"ref48","article-title":"Pytorch: An imperative style, highperformance deep learning library","volume":"32","author":"Paszke","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref49","first-page":"11974","article-title":"Dynamicvit: Efficient vision transformers with dynamic token sparsification","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Rao"},{"key":"ref50","article-title":"Token pruning using a lightweight background aware vision transformer","volume":"abs\/2410.09324","author":"Sah","year":"2024","journal-title":"CoRR"},{"key":"ref51","article-title":"Scale-sim: Systolic cnn accelerator simulator","author":"Samajdar","year":"2018","journal-title":"arXiv preprint"},{"key":"ref52","article-title":"Llava-prumerge: Adaptive token reduction for efficient large multimodal models","author":"Shang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2025.3579032"},{"key":"ref54","article-title":"Fastvid: Dynamic density pruning for fast video large language models","author":"Shen","year":"2025","journal-title":"arXiv preprint"},{"key":"ref55","article-title":"Guiding vision-language model selection for visual question-answering across tasks, domains, and knowledge types","volume":"abs\/2409.09269","author":"Sinha","year":"2024","journal-title":"CoRR"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640393"},{"key":"ref57","author":"Team","year":"2024","journal-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context"},{"key":"ref58","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00111"},{"key":"ref62","article-title":"Corematching: A co-adaptive sparse inference framework with token and neuron pruning for comprehensive acceleration of vision-language models","volume":"abs\/2505.19235","author":"Wang","year":"2025","journal-title":"CoRR"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731035"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00066"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2003.815165"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"ref68","article-title":"Minicpm-v: A gpt-4v level mllm on your phone","author":"Yao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02777"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00015"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071027"},{"key":"ref72","article-title":"Root mean square layer normalization","volume":"32","author":"Zhang","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref73","volume-title":"Lmms-eval: Reality check on the evaluation of large multimodal models","author":"Zhang","year":"2024"},{"key":"ref74","article-title":"Video instruction tuning with synthetic data","author":"Zhang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref75","article-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models","volume":"18","author":"Zhang","year":"2021","journal-title":"arXiv preprint"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01278"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408525.pdf?arnumber=11408525","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T20:17:50Z","timestamp":1773778670000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408525\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":76,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408525","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}