{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T16:48:43Z","timestamp":1755794923307,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737187","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:05:41Z","timestamp":1754255141000},"page":"5237-5248","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ADMIRE: ADaptive method to enhance Multiple Image REsolutions in text-rich multi-image understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4189-9636","authenticated-orcid":false,"given":"Qipeng","family":"Zhu","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5835-2256","authenticated-orcid":false,"given":"Xiong","family":"Wang","sequence":"additional","affiliation":[{"name":"Ant Group, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5773-9065","authenticated-orcid":false,"given":"Zhihong","family":"Lu","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7519-7899","authenticated-orcid":false,"given":"Jiangwei","family":"Lao","sequence":"additional","affiliation":[{"name":"Ant Group, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1180-5513","authenticated-orcid":false,"given":"Congyun","family":"Jin","sequence":"additional","affiliation":[{"name":"Ant Group, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5625-5729","authenticated-orcid":false,"given":"Jie","family":"Chen","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6077-6704","authenticated-orcid":false,"given":"Yingzhe","family":"Peng","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1545-1854","authenticated-orcid":false,"given":"Qi","family":"Zhu","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8426-5289","authenticated-orcid":false,"given":"Lianzhen","family":"Zhong","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3020-2677","authenticated-orcid":false,"given":"Jiajia","family":"Liu","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1220-4527","authenticated-orcid":false,"given":"Peng","family":"Wei","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4144-1753","authenticated-orcid":false,"given":"Jian","family":"Wang","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27828"},{"key":"e_1_3_2_2_2_1","unstructured":"Daniel Bolya Cheng-Yang Fu Xiaoliang Dai Peizhao Zhang Christoph Feichtenhofer and Judy Hoffman. 2022. Token merging: Your vit but faster. arXiv preprint arXiv:2210.09461(2022)."},{"key":"e_1_3_2_2_3_1","first-page":"15710","volume-title":"MADTP: Multimodal Alignment-Guided Dynamic Token Pruning for Accelerating Vision-Language Transformer. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024","author":"Cao Jianjian","year":"2024","unstructured":"Jianjian Cao, Peng Ye, Shengze Li, Chong Yu, Yansong Tang, Jiwen Lu, and Tao Chen. 2024. MADTP: Multimodal Alignment-Guided Dynamic Token Pruning for Accelerating Vision-Language Transformer. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024, Seattle, WA, USA, June 16-22, 2024. 15710-15719."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"e_1_3_2_2_5_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198."},{"key":"e_1_3_2_2_6_1","unstructured":"Xiaoyi Dong Pan Zhang Yuhang Zang Yuhang Cao Bin Wang Linke Ouyang Songyang Zhang Haodong Duan Wenwei Zhang Yining Li et al. 2024. Internlm-xcomposer2-4khd: A pioneering large vision-language model handling resolutions from 336 pixels to 4k hd. arXiv preprint arXiv:2404.06512(2024)."},{"key":"e_1_3_2_2_7_1","unstructured":"Anwen Hu Haiyang Xu Liang Zhang Jiabo Ye Ming Yan Ji Zhang Qin Jin Fei Huang and Jingren Zhou. 2024. mplug-docowl2: High-resolution compressing for ocr-free multi-page document understanding. arXiv preprint arXiv:2409.03420(2024)."},{"key":"e_1_3_2_2_8_1","volume-title":"Mini-monkey: Alleviate the sawtooth effect by multi-scale adaptive cropping. arXiv e-prints(2024), arXiv-2408.","author":"Huang Mingxin","year":"2024","unstructured":"Mingxin Huang, Yuliang Liu, Dingkang Liang, Lianwen Jin, and Xiang Bai. 2024b. Mini-monkey: Alleviate the sawtooth effect by multi-scale adaptive cropping. arXiv e-prints(2024), arXiv-2408."},{"key":"e_1_3_2_2_9_1","volume-title":"Hires-llava: Restoring fragmentation input in high-resolution large vision-language models. arXiv preprint arXiv:2407.08706(2024).","author":"Huang Runhui","year":"2024","unstructured":"Runhui Huang, Xinpeng Ding, Chunwei Wang, Jianhua Han, Yulong Liu, Hengshuang Zhao, Hang Xu, Lu Hou, Wei Zhang, and Xiaodan Liang. 2024a. Hires-llava: Restoring fragmentation input in high-resolution large vision-language models. arXiv preprint arXiv:2407.08706(2024)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548112"},{"key":"e_1_3_2_2_11_1","first-page":"16206","volume-title":"Resource- Efficient Transformer Pruning for Finetuning of Large Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024","author":"Ilhan Fatih","year":"2024","unstructured":"Fatih Ilhan, Gong Su, Selim Furkan Tekin, Tiansheng Huang, Sihao Hu, and Ling Liu. 2024. Resource- Efficient Transformer Pruning for Finetuning of Large Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024, Seattle, WA, USA, June 16-22, 2024. 16206-16215."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00442"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00141"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2408.12637"},{"key":"e_1_3_2_2_15_1","volume-title":"Llava-next-interleave: Tackling multi-image, video, and 3d in large multimodal models. arXiv preprint arXiv:2407.07895(2024).","author":"Li Feng","year":"2024","unstructured":"Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, and Chunyuan Li. 2024c. Llava-next-interleave: Tackling multi-image, video, and 3d in large multimodal models. arXiv preprint arXiv:2407.07895(2024)."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01472"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"e_1_3_2_2_18_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453476"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29874"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26598"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01789"},{"key":"e_1_3_2_2_23_1","first-page":"16070","volume-title":"Zero-TPrune: Zero-Shot Token Pruning Through Leveraging of the Attention Graph in Pre-Trained Transformers. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024","author":"Wang Hongjie","year":"2024","unstructured":"Hongjie Wang, Bhishma Dedhia, and Niraj K. Jha. 2024b. Zero-TPrune: Zero-Shot Token Pruning Through Leveraging of the Attention Graph in Pre-Trained Transformers. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024, Seattle, WA, USA, June 16-22, 2024. 16070-16079."},{"key":"e_1_3_2_2_24_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191(2024)."},{"key":"e_1_3_2_2_25_1","volume-title":"Efficientvlm: Fast and accurate vision-language models via knowledge distillation and modal-adaptive pruning. arXiv preprint arXiv:2210.07795(2022).","author":"Wang Tiannan","year":"2022","unstructured":"Tiannan Wang, Wangchunshu Zhou, Yan Zeng, and Xinsong Zhang. 2022. Efficientvlm: Fast and accurate vision-language models via knowledge distillation and modal-adaptive pruning. arXiv preprint arXiv:2210.07795(2022)."},{"key":"e_1_3_2_2_26_1","volume-title":"General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model. CoRR","author":"Wei Haoran","year":"2024","unstructured":"Haoran Wei, Chenglong Liu, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Liang Zhao, Jianjian Sun, Yuang Peng, Chunrui Han, and Xiangyu Zhang. 2024. General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model. CoRR, Vol. abs\/2409.01704 (2024)."},{"key":"e_1_3_2_2_27_1","volume-title":"Hiri-vit: Scaling vision transformer with high resolution inputs","author":"Yao Ting","year":"2024","unstructured":"Ting Yao, Yehao Li, Yingwei Pan, and Tao Mei. 2024. Hiri-vit: Scaling vision transformer with high resolution inputs. IEEE Transactions on Pattern Analysis and Machine Intelligence(2024)."},{"key":"e_1_3_2_2_28_1","first-page":"2841","volume-title":"UReader: Universal OCR-free Visually-situated Language Understanding with Multimodal Large Language Model. In Findings of the Association for Computational Linguistics: EMNLP 2023","author":"Ye Jiabo","year":"2023","unstructured":"Jiabo Ye, Anwen Hu, Haiyang Xu, Qinghao Ye, Ming Yan, Guohai Xu, Chenliang Li, Junfeng Tian, Qi Qian, Ji Zhang, Qin Jin, Liang He, Xin Lin, and Fei Huang. 2023. UReader: Universal OCR-free Visually-situated Language Understanding with Multimodal Large Language Model. In Findings of the Association for Computational Linguistics: EMNLP 2023, Singapore, December 6-10, 2023. 2841-2858."},{"key":"e_1_3_2_2_29_1","volume-title":"mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models. CoRR","author":"Ye Jiabo","year":"2024","unstructured":"Jiabo Ye, Haiyang Xu, Haowei Liu, Anwen Hu, Ming Yan, Qi Qian, Ji Zhang, Fei Huang, and Jingren Zhou. 2024. mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models. CoRR, Vol. abs\/2408.04840 (2024)."},{"key":"e_1_3_2_2_30_1","unstructured":"Jiarui Zhang Mahyar Khayatkhoei Prateek Chhikara and Filip Ilievski. 2025. MLLMs know where to look: Training-free perception of small visual details with multimodal LLMs. arXiv preprint arXiv:2502.17422(2025)."},{"key":"e_1_3_2_2_31_1","volume-title":"Long Context Transfer from Language to Vision. CoRR","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Kaichen Zhang, Bo Li, Guangtao Zeng, Jingkang Yang, Yuanhan Zhang, Ziyue Wang, Haoran Tan, Chunyuan Li, and Ziwei Liu. 2024c. Long Context Transfer from Language to Vision. CoRR, Vol. abs\/2406.16852 (2024)."},{"key":"e_1_3_2_2_32_1","volume-title":"Sparsevlm: Visual token sparsification for efficient vision-language model inference. arXiv preprint arXiv:2410.04417(2024).","author":"Zhang Yuan","year":"2024","unstructured":"Yuan Zhang, Chun-Kai Fan, Junpeng Ma, Wenzhao Zheng, Tao Huang, Kuan Cheng, Denis Gudovskiy, Tomoyuki Okuno, Yohei Nakata, Kurt Keutzer, et al., 2024a. Sparsevlm: Visual token sparsification for efficient vision-language model inference. arXiv preprint arXiv:2410.04417(2024)."},{"key":"e_1_3_2_2_33_1","volume-title":"Sparsevlm: Visual token sparsification for efficient vision-language model inference. arXiv preprint arXiv:2410.04417(2024).","author":"Zhang Yuan","year":"2024","unstructured":"Yuan Zhang, Chun-Kai Fan, Junpeng Ma, Wenzhao Zheng, Tao Huang, Kuan Cheng, Denis Gudovskiy, Tomoyuki Okuno, Yohei Nakata, Kurt Keutzer, et al., 2024b. Sparsevlm: Visual token sparsification for efficient vision-language model inference. arXiv preprint arXiv:2410.04417(2024)."}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"location":"Toronto ON Canada","acronym":"KDD '25"},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737187","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T14:33:06Z","timestamp":1755354786000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737187"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":33,"alternative-id":["10.1145\/3711896.3737187","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737187","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}