{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:17Z","timestamp":1765339757081,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754503","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"7172-7180","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["TF-ATM: Training-Free Adaptive Token Merging"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-1868-3652","authenticated-orcid":false,"given":"Xin","family":"Zhang","sequence":"first","affiliation":[{"name":"Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8310-024X","authenticated-orcid":false,"given":"Weiying","family":"Xie","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0234-6270","authenticated-orcid":false,"given":"Yunsong","family":"Li","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6904-1219","authenticated-orcid":false,"given":"Xiaoyu","family":"Chen","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5178-9309","authenticated-orcid":false,"given":"Tianlin","family":"Hui","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7782-8184","authenticated-orcid":false,"given":"Jitao","family":"Ma","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2351-4461","authenticated-orcid":false,"given":"Leyuan","family":"Fang","sequence":"additional","affiliation":[{"name":"Hunan University, Changsha, Hunan, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"International Conference on Learning Representations.","author":"Bolya Daniel","year":"2023","unstructured":"Daniel Bolya, Cheng-Yang Fu, Xiaoliang Dai, Peizhao Zhang, Christoph Feichtenhofer, and Judy Hoffman. 2023. Token merging: Your ViT but faster. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01574"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_41"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_5_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20083-0_24"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3293763"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587747"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680645"},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Machine Learning. PMLR, 3690-3699","author":"Goyal Saurabh","year":"2020","unstructured":"Saurabh Goyal, Anamitra Roy Choudhury, Saurabh Raje, Venkatesan Chakaravarthy, Yogish Sabharwal, and Ashish Verma. 2020. Power-bert: Accelerating bert inference via progressive word-vector elimination. In International Conference on Machine Learning. PMLR, 3690-3699."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.213"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20083-0_37"},{"key":"e_1_3_2_1_14_1","volume-title":"Learning to merge tokens via decoupled embedding for efficient vision transformers. arXiv preprint arXiv:2412.10569","author":"Lee Dong Hoon","year":"2024","unstructured":"Dong Hoon Lee and Seunghoon Hong. 2024. Learning to merge tokens via decoupled embedding for efficient vision transformers. arXiv preprint arXiv:2412.10569 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Dual-Depth Unified Joint Optimization: Adaptive Curvature-Based Compression","author":"Li Yunsong","year":"2025","unstructured":"Yunsong Li, Xin Zhang, Weiying Xie, Xiaoyu Chen, Daixun Li, Hangyu Ye, and Leyuan Fang. 2025. Dual-Depth Unified Joint Optimization: Adaptive Curvature-Based Compression. IEEE Transactions on Circuits and Systems for Video Technology (2025)."},{"key":"e_1_3_2_1_16_1","volume-title":"Not all patches are what you need: Expediting vision transformers via token reorganizations. arXiv preprint arXiv:2202.07800","author":"Liang Youwei","year":"2022","unstructured":"Youwei Liang, Chongjian Ge, Zhan Tong, Yibing Song, Jue Wang, and Pengtao Xie. 2022. Not all patches are what you need: Expediting vision transformers via token reorganizations. arXiv preprint arXiv:2202.07800 (2022)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00996"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02263"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.3390\/app13095521"},{"key":"e_1_3_2_1_21_1","first-page":"154","article-title":"Interpretability-aware redundancy reduction for vision transformers","volume":"12","author":"Pan Bowen","year":"2024","unstructured":"Bowen Pan, Rameswar Panda, Rogerio Schmidt Feris, and Aude Jeanne Oliva. 2024. Interpretability-aware redundancy reduction for vision transformers. US Patent 12,154,307.","journal-title":"US Patent"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00043"},{"key":"e_1_3_2_1_23_1","first-page":"13937","article-title":"Dynamicvit: Efficient vision transformers with dynamic token sparsification","volume":"34","author":"Rao Yongming","year":"2021","unstructured":"Yongming Rao, Wenliang Zhao, Benlin Liu, Jiwen Lu, Jie Zhou, and Cho-Jui Hsieh. 2021. Dynamicvit: Efficient vision transformers with dynamic token sparsification. In Advances in Neural Information Processing Systems, Vol. 34. 13937-13949.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00302"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01185"},{"key":"e_1_3_2_1_26_1","first-page":"10936","article-title":"Scop: Scientific control for reliable neural network pruning","volume":"33","author":"Tang Yehui","year":"2020","unstructured":"Yehui Tang, Yunhe Wang, Yixing Xu, Dacheng Tao, Chunjing Xu, Chao Xu, and Chang Xu. 2020. Scop: Scientific control for reliable neural network pruning. In Advances in Neural Information Processing Systems, Vol. 33. 10936-10947.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. PMLR, 10347-10357","author":"Touvron Hugo","year":"2021","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2021. Training data-efficient image transformers & distillation through attention. In International Conference on Machine Learning. PMLR, 10347-10357."},{"key":"e_1_3_2_1_28_1","first-page":"30772","article-title":"Accelerating transformers with spectrum-preserving token merging","volume":"37","author":"Tran Chau","year":"2024","unstructured":"Chau Tran, Duy MH Nguyen, Manh-Duy Nguyen, TrungTin Nguyen, Ngan Le, Pengtao Xie, Daniel Sonntag, James Y Zou, Binh Nguyen, and Mathias Niepert. 2024. Accelerating transformers with spectrum-preserving token merging. In Advances in Neural Information Processing Systems, Vol. 37. 30772-30810.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612303"},{"key":"e_1_3_2_1_30_1","volume-title":"Anti-oversmoothing in deep vision transformers via the fourier domain analysis: From theory to practice. arXiv preprint arXiv:2203.05962","author":"Wang Peihao","year":"2022","unstructured":"Peihao Wang, Wenqing Zheng, Tianlong Chen, and Zhangyang Wang. 2022b. Anti-oversmoothing in deep vision transformers via the fourier domain analysis: From theory to practice. arXiv preprint arXiv:2203.05962 (2022)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680983"},{"key":"e_1_3_2_1_32_1","first-page":"13974","article-title":"Vtc-lfc: Vision transformer compression with low-frequency components","volume":"35","author":"Wang Zhenyu","year":"2022","unstructured":"Zhenyu Wang, Hao Luo, Pichao Wang, Feng Ding, Fan Wang, and Hao Li. 2022a. Vtc-lfc: Vision transformer compression with low-frequency components. In Advances in Neural Information Processing Systems, Vol. 35. 13974-13988.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00208"},{"key":"e_1_3_2_1_34_1","first-page":"30392","article-title":"Early convolutions help transformers see better","volume":"34","author":"Xiao Tete","year":"2021","unstructured":"Tete Xiao, Mannat Singh, Eric Mintun, Trevor Darrell, Piotr Doll\u00e1r, and Ross Girshick. 2021. Early convolutions help transformers see better. In Advances in Neural Information Processing Systems, Vol. 34. 30392-30400.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20202"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680906"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01054"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680872"},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Learning Representations.","author":"Zhang Yuyao","year":"2024","unstructured":"Yuyao Zhang, Lan Wei, and Nikolaos Freris. 2024. Synergistic patch pruning for vision transformer: unifying intra-& inter-layer patch importance. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_40_1","first-page":"9010","article-title":"Savit: Structure-aware vision transformer pruning via collaborative optimization","volume":"35","author":"Zheng Chuanyang","year":"2022","unstructured":"Chuanyang Zheng, Kai Zhang, Zhi Yang, Wenming Tan, Jun Xiao, Ye Ren, Shiliang Pu, et al., 2022. Savit: Structure-aware vision transformer pruning via collaborative optimization. In Advances in Neural Information Processing Systems, Vol. 35. 9010-9023.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","volume-title":"Deepvit: Towards deeper vision transformer. arXiv preprint arXiv:2103.11886","author":"Zhou Daquan","year":"2021","unstructured":"Daquan Zhou, Bingyi Kang, Xiaojie Jin, Linjie Yang, Xiaochen Lian, Zihang Jiang, Qibin Hou, and Jiashi Feng. 2021. Deepvit: Towards deeper vision transformer. arXiv preprint arXiv:2103.11886 (2021)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754503","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:04:53Z","timestamp":1765339493000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754503"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":41,"alternative-id":["10.1145\/3746027.3754503","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754503","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}