{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:10:44Z","timestamp":1765311044473,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":87,"publisher":"ACM","funder":[{"name":"State Key Laboratory of General Artificial Intelligence"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755119","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"10935-10944","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Fast3D: Accelerating 3D Multi-modal Large Language Models for Efficient 3D Scene Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1555-3674","authenticated-orcid":false,"given":"Wencan","family":"Huang","sequence":"first","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8179-4508","authenticated-orcid":false,"given":"Daizong","family":"Liu","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9860-0922","authenticated-orcid":false,"given":"Wei","family":"Hu","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv","author":"Alexey Dosovitskiy","year":"2010","unstructured":"Dosovitskiy Alexey. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv: 2010.11929 (2020)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01854"},{"key":"e_1_3_2_1_4_1","volume-title":"Token merging: Your vit but faster. arXiv preprint arXiv:2210.09461","author":"Bolya Daniel","year":"2022","unstructured":"Daniel Bolya, Cheng-Yang Fu, Xiaoliang Dai, Peizhao Zhang, Christoph Feichtenhofer, and Judy Hoffman. 2022. Token merging: Your vit but faster. arXiv preprint arXiv:2210.09461 (2022)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01487"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01311"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"e_1_3_2_1_8_1","volume-title":"Llavolta: Efficient multi-modal models via stage-wise visual context compression. arXiv preprint arXiv:2406.20092","author":"Chen Jieneng","year":"2024","unstructured":"Jieneng Chen, Luoxin Ye, Ju He, Zhao-Yang Wang, Daniel Khashabi, and Alan Yuille. 2024c. Llavolta: Efficient multi-modal models via stage-wise visual context compression. arXiv preprint arXiv:2406.20092 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01574"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02496"},{"key":"e_1_3_2_1_12_1","volume-title":"Language conditioned spatial relation reasoning for 3d object grounding. Advances in neural information processing systems","author":"Chen Shizhe","year":"2022","unstructured":"Shizhe Chen, Pierre-Louis Guhur, Makarand Tapaswi, Cordelia Schmid, and Ivan Laptev. 2022. Language conditioned spatial relation reasoning for 3d object grounding. Advances in neural information processing systems, Vol. 35 (2022), 20522-20535."},{"key":"e_1_3_2_1_13_1","volume-title":"Grounded 3D-LLM with Referent Tokens. arXiv preprint arXiv:2405.10370","author":"Chen Yilun","year":"2024","unstructured":"Yilun Chen, Shuai Yang, Haifeng Huang, Tai Wang, Ruiyuan Lyu, Runsen Xu, Dahua Lin, and Jiangmiao Pang. 2024b. Grounded 3D-LLM with Referent Tokens. arXiv preprint arXiv:2405.10370 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00321"},{"key":"e_1_3_2_1_15_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys.org (accessed","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al., 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys.org (accessed 14 April 2023), Vol. 2, 3 (2023), 6."},{"key":"e_1_3_2_1_16_1","volume-title":"The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=GThTiuXgDC","author":"Chu Hengshuo","year":"2025","unstructured":"Hengshuo Chu, Xiang Deng, Qi Lv, Xiaoyang Chen, Yinchuan Li, Jianye HAO, and Liqiang Nie. 2025. 3D-AffordanceLLM: Harnessing Large Language Models for Open-Vocabulary Affordance Detection in 3D Worlds. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=GThTiuXgDC"},{"key":"e_1_3_2_1_17_1","volume-title":"Mobilevlm: A fast, strong and open vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886","author":"Chu Xiangxiang","year":"2023","unstructured":"Xiangxiang Chu, Limeng Qiao, Xinyang Lin, Shuang Xu, Yang Yang, Yiming Hu, Fei Wei, Xinyu Zhang, Bo Zhang, Xiaolin Wei, et al., 2023. Mobilevlm: A fast, strong and open vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886 (2023)."},{"key":"e_1_3_2_1_18_1","unstructured":"Xiangxiang Chu Limeng Qiao Xinyu Zhang Shuang Xu Fei Wei Yang Yang Xiaofei Sun Yiming Hu Xinyang Lin Bo Zhang et al. 2024. Mobilevlm v2: Faster and stronger baseline for vision language model. arXiv preprint arXiv:2402.03766 (2024)."},{"key":"e_1_3_2_1_19_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20083-0_24"},{"key":"e_1_3_2_1_21_1","volume-title":"Scene-llm: Extending language model for 3d visual understanding and reasoning. arXiv preprint arXiv:2403.11401","author":"Fu Rao","year":"2024","unstructured":"Rao Fu, Jingyu Liu, Xilun Chen, Yixin Nie, and Wenhan Xiong. 2024. Scene-llm: Extending language model for 3d visual understanding and reasoning. arXiv preprint arXiv:2403.11401 (2024)."},{"key":"e_1_3_2_1_22_1","unstructured":"Ziyu Guo Renrui Zhang Xiangyang Zhu Yiwen Tang Xianzheng Ma Jiaming Han Kexin Chen Peng Gao Xianzhi Li Hongsheng Li et al. 2023. Point-bind & point-llm: Aligning point cloud with multi-modality for 3d understanding generation and instruction following. arXiv preprint arXiv:2309.00615 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Imagebind-llm: Multi-modality instruction tuning. arXiv preprint arXiv:2309.03905","author":"Han Jiaming","year":"2023","unstructured":"Jiaming Han, Renrui Zhang, Wenqi Shao, Peng Gao, Peng Xu, Han Xiao, Kaipeng Zhang, Chris Liu, Song Wen, Ziyu Guo, et al., 2023. Imagebind-llm: Multi-modality instruction tuning. arXiv preprint arXiv:2309.03905 (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3117837"},{"key":"e_1_3_2_1_25_1","volume-title":"Rethinking Token Reduction in MLLMs: Towards a Unified Paradigm for Training-Free Acceleration. arXiv preprint arXiv:2411.17686","author":"Han Yuhang","year":"2024","unstructured":"Yuhang Han, Xuyang Liu, Pengxiang Ding, Donglin Wang, Honggang Chen, Qingsen Yan, and Siteng Huang. 2024a. Rethinking Token Reduction in MLLMs: Towards a Unified Paradigm for Training-Free Acceleration. arXiv preprint arXiv:2411.17686 (2024)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3393530"},{"key":"e_1_3_2_1_27_1","volume-title":"European Conference on Computer Vision. Springer, 349-367","author":"He Shuting","year":"2024","unstructured":"Shuting He, Henghui Ding, Xudong Jiang, and Bihan Wen. 2024. Segpoint: Segment any point cloud via large language model. In European Conference on Computer Vision. Springer, 349-367."},{"key":"e_1_3_2_1_28_1","first-page":"20482","article-title":"3d-llm: Injecting the 3d world into large language models","volume":"36","author":"Hong Yining","year":"2023","unstructured":"Yining Hong, Haoyu Zhen, Peihao Chen, Shuhong Zheng, Yilun Du, Zhenfang Chen, and Chuang Gan. 2023. 3d-llm: Injecting the 3d world into large language models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 20482-20494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Huang Haifeng","year":"2024","unstructured":"Haifeng Huang, Yilun Chen, Zehan Wang, Rongjie Huang, Runsen Xu, Tai Wang, Luping Liu, Xize Cheng, Yang Zhao, Jiangmiao Pang, et al., 2024a. Chat-scene: Bridging 3d scene and large language models with object identifiers. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_30_1","volume-title":"Chat-3d v2: Bridging 3d scene and large language models with object identifiers. arXiv preprint arXiv:2312.08168","author":"Huang Haifeng","year":"2023","unstructured":"Haifeng Huang, Zehan Wang, Rongjie Huang, Luping Liu, Xize Cheng, Yang Zhao, Tao Jin, and Zhou Zhao. 2023b. Chat-3d v2: Bridging 3d scene and large language models with object identifiers. arXiv preprint arXiv:2312.08168 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"An embodied generalist agent in 3d world. arXiv preprint arXiv:2311.12871","author":"Huang Jiangyong","year":"2023","unstructured":"Jiangyong Huang, Silong Yong, Xiaojian Ma, Xiongkun Linghu, Puhao Li, Yan Wang, Qing Li, Song-Chun Zhu, Baoxiong Jia, and Siyuan Huang. 2023c. An embodied generalist agent in 3d world. arXiv preprint arXiv:2311.12871 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"IVTP: Instruction-Guided Visual Token Pruning for Large Vision-Language Models. In European Conference on Computer Vision. Springer, 214-230","author":"Huang Kai","year":"2024","unstructured":"Kai Huang, Hao Zou, Ye Xi, BoChen Wang, Zhen Xie, and Liang Yu. 2024c. IVTP: Instruction-Guided Visual Token Pruning for Large Vision-Language Models. In European Conference on Computer Vision. Springer, 214-230."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611902"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680758"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32427"},{"key":"e_1_3_2_1_36_1","volume-title":"European Conference on Computer Vision. Springer, 436-455","author":"Ju Chen","year":"2024","unstructured":"Chen Ju, Haicheng Wang, Haozhe Cheng, Xu Chen, Zhonghua Zhai, Weilin Huang, Jinsong Lan, Shuai Xiao, and Bo Zheng. 2024. Turbo: Informativity-driven acceleration plug-in for vision-language large models. In European Conference on Computer Vision. Springer, 436-455."},{"key":"e_1_3_2_1_37_1","volume-title":"Robin3d: Improving 3d large language model via robust instruction tuning. arXiv preprint arXiv:2410.00255","author":"Kang Weitai","year":"2024","unstructured":"Weitai Kang, Haifeng Huang, Yuzhang Shang, Mubarak Shah, and Yan Yan. 2024. Robin3d: Improving 3d large language model via robust instruction tuning. arXiv preprint arXiv:2410.00255 (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"e_1_3_2_1_39_1","volume-title":"Beyond Token Compression: A Training-Free Reduction Framework for Efficient Visual Processing in MLLMs. arXiv preprint arXiv:2501.19036","author":"Li Hongliang","year":"2025","unstructured":"Hongliang Li, Jiaxin Zhang, Wenhui Liao, Dezhi Peng, Kai Ding, and Lianwen Jin. 2025. Beyond Token Compression: A Training-Free Reduction Framework for Efficient Visual Processing in MLLMs. arXiv preprint arXiv:2501.19036 (2025)."},{"key":"e_1_3_2_1_40_1","volume-title":"Tokenpacker: Efficient visual projector for multimodal llm. arXiv preprint arXiv:2407.02392","author":"Li Wentong","year":"2024","unstructured":"Wentong Li, Yuqian Yuan, Jian Liu, Dongqi Tang, Song Wang, Jie Qin, Jianke Zhu, and Lei Zhang. 2024b. Tokenpacker: Efficient visual projector for multimodal llm. arXiv preprint arXiv:2407.02392 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"European Conference on Computer Vision. Springer, 323-340","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2024a. Llama-vid: An image is worth 2 tokens in large language models. In European Conference on Computer Vision. Springer, 323-340."},{"key":"e_1_3_2_1_42_1","volume-title":"Not all patches are what you need: Expediting vision transformers via token reorganizations. arXiv preprint arXiv:2202.07800","author":"Liang Youwei","year":"2022","unstructured":"Youwei Liang, Chongjian Ge, Zhan Tong, Yibing Song, Jue Wang, and Pengtao Xie. 2022. Not all patches are what you need: Expediting vision transformers via token reorganizations. arXiv preprint arXiv:2202.07800 (2022)."},{"key":"e_1_3_2_1_43_1","volume-title":"Boosting Multimodal Large Language Models with Visual Tokens Withdrawal for Rapid Inference. arXiv preprint arXiv:2405.05803","author":"Lin Zhihang","year":"2024","unstructured":"Zhihang Lin, Mingbao Lin, Luxi Lin, and Rongrong Ji. 2024. Boosting Multimodal Large Language Models with Visual Tokens Withdrawal for Rapid Inference. arXiv preprint arXiv:2405.05803 (2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01110"},{"key":"e_1_3_2_1_45_1","volume-title":"A survey on text-guided 3D visual grounding: elements, recent advances, and future directions. arXiv preprint arXiv:2406.05785","author":"Liu Daizong","year":"2024","unstructured":"Daizong Liu, Yang Liu, Wencan Huang, and Wei Hu. 2024b. A survey on text-guided 3D visual grounding: elements, recent advances, and future directions. arXiv preprint arXiv:2406.05785 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"A survey of attacks on large vision-language models: Resources, advances, and future trends. arXiv preprint arXiv:2407.07403","author":"Liu Daizong","year":"2024","unstructured":"Daizong Liu, Mingyu Yang, Xiaoye Qu, Pan Zhou, Yu Cheng, and Wei Hu. 2024c. A survey of attacks on large vision-language models: Resources, advances, and future trends. arXiv preprint arXiv:2407.07403 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Liu Daizong","year":"2024","unstructured":"Daizong Liu, Mingyu Yang, Xiaoye Qu, Pan Zhou, Xiang Fang, Keke Tang, Yao Wan, and Lichao Sun. 2024d. Pandora's Box: Towards Building Universal Attackers against Real-World Large Vision-Language Models. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_48_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024a. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_49_1","unstructured":"Ilya Loshchilov Frank Hutter et al. 2017. Fixing weight decay regularization in adam. arXiv preprint arXiv:1711.05101 Vol. 5 (2017) 5."},{"key":"e_1_3_2_1_50_1","volume-title":"Sqa3d: Situated question answering in 3d scenes. arXiv preprint arXiv:2210.07474","author":"Ma Xiaojian","year":"2022","unstructured":"Xiaojian Ma, Silong Yong, Zilong Zheng, Qing Li, Yitao Liang, Song-Chun Zhu, and Siyuan Huang. 2022. Sqa3d: Situated question answering in 3d scenes. arXiv preprint arXiv:2210.07474 (2022)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01199"},{"key":"e_1_3_2_1_52_1","volume-title":"DeepStack: Deeply Stacking Visual Tokens is Surprisingly Simple and Effective for LMMs. arXiv preprint arXiv:2406.04334","author":"Meng Lingchen","year":"2024","unstructured":"Lingchen Meng, Jianwei Yang, Rui Tian, Xiyang Dai, Zuxuan Wu, Jianfeng Gao, and Yu-Gang Jiang. 2024. DeepStack: Deeply Stacking Visual Tokens is Surprisingly Simple and Effective for LMMs. arXiv preprint arXiv:2406.04334 (2024)."},{"key":"e_1_3_2_1_53_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_1_55_1","volume-title":"European Conference on Computer Vision. Springer, 214-238","author":"Qi Zekun","year":"2024","unstructured":"Zekun Qi, Runpei Dong, Shaochen Zhang, Haoran Geng, Chunrui Han, Zheng Ge, Li Yi, and Kaisheng Ma. 2024a. Shapellm: Universal 3d object understanding for embodied interaction. In European Conference on Computer Vision. Springer, 214-238."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02495"},{"key":"e_1_3_2_1_57_1","volume-title":"GPT4Scene: Understand 3D Scenes from Videos with Vision-Language Models. arXiv preprint arXiv:2501.01428","author":"Qi Zhangyang","year":"2025","unstructured":"Zhangyang Qi, Zhixiong Zhang, Ye Fang, Jiaqi Wang, and Hengshuang Zhao. 2025. GPT4Scene: Understand 3D Scenes from Videos with Vision-Language Models. arXiv preprint arXiv:2501.01428 (2025)."},{"key":"e_1_3_2_1_58_1","volume-title":"Dynamicvit: Efficient vision transformers with dynamic token sparsification. Advances in neural information processing systems","author":"Rao Yongming","year":"2021","unstructured":"Yongming Rao, Wenliang Zhao, Benlin Liu, Jiwen Lu, Jie Zhou, and Cho-Jui Hsieh. 2021. Dynamicvit: Efficient vision transformers with dynamic token sparsification. Advances in neural information processing systems, Vol. 34 (2021), 13937-13949."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160590"},{"key":"e_1_3_2_1_60_1","volume-title":"Yong Jae Lee, and Yan Yan","author":"Shang Yuzhang","year":"2024","unstructured":"Yuzhang Shang, Mu Cai, Bingxin Xu, Yong Jae Lee, and Yan Yan. 2024. Llava-prumerge: Adaptive token reduction for efficient large multimodal models. arXiv preprint arXiv:2403.15388 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Crossget: Cross-guided ensemble of tokens for accelerating vision-language transformers. arXiv preprint arXiv:2305.17455","author":"Shi Dachuan","year":"2023","unstructured":"Dachuan Shi, Chaofan Tao, Anyi Rao, Zhendong Yang, Chun Yuan, and Jiaqi Wang. 2023. Crossget: Cross-guided ensemble of tokens for accelerating vision-language transformers. arXiv preprint arXiv:2305.17455 (2023)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681257"},{"key":"e_1_3_2_1_63_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al., 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_64_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"volume-title":"Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/vicuna.lmsys.org\/.","year":"2023","key":"e_1_3_2_1_66_1","unstructured":"Vicuna. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/vicuna.lmsys.org\/."},{"key":"e_1_3_2_1_67_1","volume-title":"arXiv preprint arXiv:2412.05819","author":"Wang Ao","year":"2024","unstructured":"Ao Wang, Fengyuan Sun, Hui Chen, Zijia Lin, Jungong Han, and Guiguang Ding. 2024b. [CLS] Token Tells Everything Needed for Training-free Efficient MLLMs. arXiv preprint arXiv:2412.05819 (2024)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01521"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.18"},{"key":"e_1_3_2_1_70_1","volume-title":"Chat-3d: Data-efficiently tuning large language model for universal dialogue of 3d scenes. arXiv preprint arXiv:2308.08769","author":"Wang Zehan","year":"2023","unstructured":"Zehan Wang, Haifeng Huang, Yang Zhao, Ziang Zhang, and Zhou Zhao. 2023. Chat-3d: Data-efficiently tuning large language model for universal dialogue of 3d scenes. arXiv preprint arXiv:2308.08769 (2023)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681464"},{"key":"e_1_3_2_1_72_1","volume-title":"European Conference on Computer Vision. Springer, 131-147","author":"Xu Runsen","year":"2024","unstructured":"Runsen Xu, Xiaolong Wang, Tai Wang, Yilun Chen, Jiangmiao Pang, and Dahua Lin. 2024. Pointllm: Empowering large language models to understand point clouds. In European Conference on Computer Vision. Springer, 131-147."},{"key":"e_1_3_2_1_73_1","volume-title":"DeCo: Decoupling Token Compression from Semantic Abstraction in Multimodal Large Language Models. arXiv preprint arXiv:2405.20985","author":"Yao Linli","year":"2024","unstructured":"Linli Yao, Lei Li, Shuhuai Ren, Lean Wang, Yuanxin Liu, Xu Sun, and Lu Hou. 2024. DeCo: Decoupling Token Compression from Semantic Abstraction in Multimodal Large Language Models. arXiv preprint arXiv:2405.20985 (2024)."},{"key":"e_1_3_2_1_74_1","volume-title":"Fit and prune: Fast and training-free visual token pruning for multi-modal large language models. arXiv preprint arXiv:2409.10197","author":"Ye Weihao","year":"2024","unstructured":"Weihao Ye, Qiong Wu, Wenhao Lin, and Yiyi Zhou. 2024b. Fit and prune: Fast and training-free visual token pruning for multi-modal large language models. arXiv preprint arXiv:2409.10197 (2024)."},{"key":"e_1_3_2_1_75_1","volume-title":"VoCo-LLaMA: Towards Vision Compression with Large Language Models. arXiv preprint arXiv:2406.12275","author":"Ye Xubing","year":"2024","unstructured":"Xubing Ye, Yukang Gan, Xiaoke Huang, Yixiao Ge, Ying Shan, and Yansong Tang. 2024a. VoCo-LLaMA: Towards Vision Compression with Large Language Models. arXiv preprint arXiv:2406.12275 (2024)."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01320"},{"key":"e_1_3_2_1_77_1","volume-title":"3DGraphLLM: Combining Semantic Graphs and Large Language Models for 3D Scene Understanding. arXiv preprint arXiv:2412.18450","author":"Zemskova Tatiana","year":"2024","unstructured":"Tatiana Zemskova and Dmitry Yudin. 2024. 3DGraphLLM: Combining Semantic Graphs and Large Language Models for 3D Scene Understanding. arXiv preprint arXiv:2412.18450 (2024)."},{"key":"e_1_3_2_1_78_1","volume-title":"Sparsevlm: Visual token sparsification for efficient vision-language model inference. arXiv preprint arXiv:2410.04417","author":"Zhang Yuan","year":"2024","unstructured":"Yuan Zhang, Chun-Kai Fan, Junpeng Ma, Wenzhao Zheng, Tao Huang, Kuan Cheng, Denis Gudovskiy, Tomoyuki Okuno, Yohei Nakata, Kurt Keutzer, et al., 2024. Sparsevlm: Visual token sparsification for efficient vision-language model inference. arXiv preprint arXiv:2410.04417 (2024)."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01397"},{"key":"e_1_3_2_1_80_1","volume-title":"Accelerating Multimodel Large Language Models by Searching Optimal Vision Token Reduction. arXiv preprint arXiv:2412.00556","author":"Zhao Shiyu","year":"2024","unstructured":"Shiyu Zhao, Zhenting Wang, Felix Juefei-Xu, Xide Xia, Miao Liu, Xiaofang Wang, Mingfu Liang, Ning Zhang, Dimitris N Metaxas, and Licheng Yu. 2024d. Accelerating Multimodel Large Language Models by Searching Optimal Vision Token Reduction. arXiv preprint arXiv:2412.00556 (2024)."},{"key":"e_1_3_2_1_81_1","volume-title":"A Stitch in Time Saves Nine: Small VLM is a Precise Guidance for accelerating Large VLMs. arXiv preprint arXiv:2412.03324","author":"Zhao Wangbo","year":"2024","unstructured":"Wangbo Zhao, Yizeng Han, Jiasheng Tang, Zhikai Li, Yibing Song, Kai Wang, Zhangyang Wang, and Yang You. 2024a. A Stitch in Time Saves Nine: Small VLM is a Precise Guidance for accelerating Large VLMs. arXiv preprint arXiv:2412.03324 (2024)."},{"key":"e_1_3_2_1_82_1","volume-title":"Dynamic diffusion transformer. arXiv preprint arXiv:2410.03456","author":"Zhao Wangbo","year":"2024","unstructured":"Wangbo Zhao, Yizeng Han, Jiasheng Tang, Kai Wang, Yibing Song, Gao Huang, Fan Wang, and Yang You. 2024b. Dynamic diffusion transformer. arXiv preprint arXiv:2410.03456 (2024)."},{"key":"e_1_3_2_1_83_1","volume-title":"Dynamic tuning towards parameter and inference efficiency for vit adaptation. arXiv preprint arXiv:2403.11808","author":"Zhao Wangbo","year":"2024","unstructured":"Wangbo Zhao, Jiasheng Tang, Yizeng Han, Yibing Song, Kai Wang, Gao Huang, Fan Wang, and Yang You. 2024c. Dynamic tuning towards parameter and inference efficiency for vit adaptation. arXiv preprint arXiv:2403.11808 (2024)."},{"key":"e_1_3_2_1_84_1","volume-title":"Aim: Adaptive inference of multi-modal llms via token merging and pruning. arXiv preprint arXiv:2412.03248","author":"Zhong Yiwu","year":"2024","unstructured":"Yiwu Zhong, Zhuoming Liu, Yin Li, and Liwei Wang. 2024. Aim: Adaptive inference of multi-modal llms via token merging and pruning. arXiv preprint arXiv:2412.03248 (2024)."},{"key":"e_1_3_2_1_85_1","volume-title":"Uni3d: Exploring unified 3d representation at scale. arXiv preprint arXiv:2310.06773","author":"Zhou Junsheng","year":"2023","unstructured":"Junsheng Zhou, Jinsheng Wang, Baorui Ma, Yu-Shen Liu, Tiejun Huang, and Xinlong Wang. 2023. Uni3d: Exploring unified 3d representation at scale. arXiv preprint arXiv:2310.06773 (2023)."},{"key":"e_1_3_2_1_86_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_87_1","volume-title":"Accelerating Multimodal Large Language Model by Spatial-Temporal Visual Token Trimming. arXiv preprint arXiv:2412.20105","author":"Zhuang Jiedong","year":"2024","unstructured":"Jiedong Zhuang, Lu Lu, Ming Dai, Rui Hu, Jian Chen, Qiang Liu, and Haoji Hu. 2024. ST^3: Accelerating Multimodal Large Language Model by Spatial-Temporal Visual Token Trimming. arXiv preprint arXiv:2412.20105 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755119","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:07:06Z","timestamp":1765310826000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755119"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":87,"alternative-id":["10.1145\/3746027.3755119","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755119","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}