{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:57:16Z","timestamp":1777654636686,"version":"3.51.4"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732195","type":"print"},{"value":"9783031732201","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73220-1_19","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T20:03:30Z","timestamp":1730577810000},"page":"325-341","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Removing Rows and\u00a0Columns of\u00a0Tokens in\u00a0Vision Transformer Enables Faster Dense Prediction Without Retraining"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2503-4202","authenticated-orcid":false,"given":"Diwei","family":"Su","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4922-9259","authenticated-orcid":false,"given":"Cheng","family":"Fei","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0890-5256","authenticated-orcid":false,"given":"Jianxu","family":"Luo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"19_CR1","unstructured":"Bian, Z., Wang, Z., Han, W., Wang, K.: Muti-scale and token mergence: make your vit more efficient. arXiv preprint arXiv:2306.04897 (2023)"},{"key":"19_CR2","unstructured":"Bolya, D., Fu, C.Y., Dai, X., Zhang, P., Feichtenhofer, C., Hoffman, J.: Token merging: your vit but faster. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"19_CR3","unstructured":"Bonnaerens, M., Dambre, J.: Learned thresholds token merging and pruning for vision transformers. arXiv preprint arXiv:2307.10780 (2023)"},{"key":"19_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"19_CR5","unstructured":"Chen, Z., et al.: Vision transformer adapter for dense predictions. arXiv preprint arXiv:2205.08534 (2022)"},{"key":"19_CR6","unstructured":"Choromanski, K., et\u00a0al.: Rethinking attention with performers. arXiv preprint arXiv:2009.14794 (2020)"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3213\u20133223 (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Deng, J., et al.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"19_CR9","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"19_CR10","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"396","DOI":"10.1007\/978-3-031-20083-0_24","volume-title":"ECCV 2022","author":"M Fayyaz","year":"2022","unstructured":"Fayyaz, M., et al.: Adaptive token sampling for efficient vision transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13671, pp. 396\u2013414. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20083-0_24"},{"key":"19_CR11","unstructured":"Geng, Z., Guo, M.H., Chen, H., Li, X., Wei, K., Lin, Z.: Is attention better than matrix decomposition? arXiv preprint arXiv:2109.04553 (2021)"},{"issue":"5","key":"19_CR12","first-page":"5436","volume":"45","author":"MH Guo","year":"2022","unstructured":"Guo, M.H., Liu, Z.N., Mu, T.J., Hu, S.M.: Beyond self-attention: external attention using two linear layers for visual tasks. IEEE Trans. Pattern Anal. Mach. Intell. 45(5), 5436\u20135447 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Hassani, A., Walton, S., Li, J., Li, S., Shi, H.: Neighborhood attention transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6185\u20136194 (2023)","DOI":"10.1109\/CVPR52729.2023.00599"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"19_CR15","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"19_CR16","doi-asserted-by":"crossref","unstructured":"Kong, Z., et\u00a0al.: Spvit: enabling faster vision transformers via soft token pruning. arXiv preprint arXiv:2112.13890 (2021)","DOI":"10.1007\/978-3-031-20083-0_37"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Li, F., et al.: Mask dino: towards a unified transformer-based framework for object detection and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3041\u20133050 (2023)","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"19_CR18","unstructured":"Li, Y., Yuan, G., Wen, Y., Hu, J., Evangelidis, G., Tulyakov, S., Wang, Y., Ren, J.: Efficientformer: Vision transformers at mobilenet speed. In: Advance in Neural Information Processing System,vol. 35, pp. 12934\u201312949 (2022)"},{"key":"19_CR19","first-page":"35462","volume":"35","author":"W Liang","year":"2022","unstructured":"Liang, W., et al.: Expediting large-scale vision transformer for dense prediction without fine-tuning. Adv. Neural. Inf. Process. Syst. 35, 35462\u201335477 (2022)","journal-title":"Adv. Neural. Inf. Process. 
Syst."},{"key":"19_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014 Part V. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Liu, Z., et\u00a0al.: Swin transformer v2: scaling up capacity and resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12009\u201312019 (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Long, S., Zhao, Z., Pi, J., Wang, S., Wang, J.: Beyond attentive tokens: incorporating token importance and diversity for efficient vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10334\u201310343 (2023)","DOI":"10.1109\/CVPR52729.2023.00996"},{"key":"19_CR24","first-page":"21297","volume":"34","author":"J Lu","year":"2021","unstructured":"Lu, J., et al.: Soft: softmax-free transformer with linear complexity. Adv. Neural. Inf. Process. Syst. 34, 21297\u201321309 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., et al.: The role of context for object detection and semantic segmentation in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 891\u2013898 (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"19_CR26","unstructured":"Peng, H., Pappas, N., Yogatama, D., Schwartz, R., Smith, N.A., Kong, L.: Random feature attention. arXiv preprint arXiv:2103.02143 (2021)"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Singh, M., et al.: Revisiting weakly supervised pre-training of visual perception models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 804\u2013814 (2022)","DOI":"10.1109\/CVPR52688.2022.00088"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Strudel, R., Garcia, R., Laptev, I., Schmid, C.: Segmenter: transformer for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7262\u20137272 (2021)","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"19_CR29","unstructured":"Tian, K., Jiang, Y., Diao, Q., Lin, C., Wang, L., Yuan, Z.: Designing Bert for convolutional networks: Sparse and hierarchical masked modeling. arXiv preprint arXiv:2301.03580 (2023)"},{"key":"19_CR30","first-page":"24261","volume":"34","author":"IO Tolstikhin","year":"2021","unstructured":"Tolstikhin, I.O., et al.: MLP-mixer: an all-MLP architecture for vision. Adv. Neural. Inf. Process. Syst. 34, 24261\u201324272 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR31","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 
30 (2017)"},{"key":"19_CR32","first-page":"11960","volume":"34","author":"Y Wang","year":"2021","unstructured":"Wang, Y., Huang, R., Song, S., Huang, Z., Huang, G.: Not all images are worth 16x16 words: dynamic transformers for efficient image recognition. Adv. Neural. Inf. Process. Syst. 34, 11960\u201311973 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR33","first-page":"13974","volume":"35","author":"Z Wang","year":"2022","unstructured":"Wang, Z., Luo, H., Wang, P., Ding, F., Wang, F., Li, H.: VTC-LFC: vision transformer compression with low-frequency components. Adv. Neural. Inf. Process. Syst. 35, 13974\u201313988 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR34","unstructured":"Yang, J., et al.: Focal self-attention for local-global interactions in vision transformers. arXiv preprint arXiv:2107.00641 (2021)"},{"key":"19_CR35","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"480","DOI":"10.1007\/978-3-031-20053-3_28","volume-title":"ECCV 2022","author":"R Yang","year":"2022","unstructured":"Yang, R., et al.: Scalablevit: rethinking the context-oriented generalization of vision transformer. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13684, pp. 480\u2013496. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20053-3_28"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"Ye, D., Lin, Y., Huang, Y., Sun, M.: TR-BERT: dynamic token reduction for accelerating Bert inference. arXiv preprint arXiv:2105.11618 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.463"},{"key":"19_CR37","doi-asserted-by":"crossref","unstructured":"Yin, H., Vahdat, A., Alvarez, J.M., Mallya, A., Kautz, J., Molchanov, P.: A-vit: adaptive tokens for efficient vision transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10809\u201310818 (2022)","DOI":"10.1109\/CVPR52688.2022.01054"},{"key":"19_CR38","first-page":"12992","volume":"34","author":"Q Yu","year":"2021","unstructured":"Yu, Q., Xia, Y., Bai, Y., Lu, Y., Yuille, A.L., Shen, W.: Glance-and-gaze vision transformer. Adv. Neural. Inf. Process. Syst. 34, 12992\u201313003 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR39","doi-asserted-by":"crossref","unstructured":"Yu, W., et al.: Metaformer is actually what you need for vision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10819\u201310829 (2022)","DOI":"10.1109\/CVPR52688.2022.01055"},{"key":"19_CR40","doi-asserted-by":"crossref","unstructured":"Zeng, W., et al.: Not all tokens are equal: human-centric visual analysis via token clustering transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11101\u201311111 (2022)","DOI":"10.1109\/CVPR52688.2022.01082"},{"key":"19_CR41","unstructured":"Zeng, Z., et al.: VCC: scaling transformers to 128k tokens or more by prioritizing important tokens. arXiv preprint arXiv:2305.04241 (2023)"},{"key":"19_CR42","unstructured":"Zheng, M., et al.: End-to-end object detection with adaptive clustering transformer. arXiv preprint arXiv:2011.09315 (2020)"},{"key":"19_CR43","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2019","unstructured":"Zhou, B., et al.: Semantic understanding of scenes through the ade20k dataset. Int. J. Comput. Vis. 127, 302\u2013321 (2019)","journal-title":"Int. J. Comput. 
Vis."},{"key":"19_CR44","doi-asserted-by":"crossref","unstructured":"Ziwen, C., et al.: Autofocusformer: image segmentation off the grid. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18227\u201318236 (2023)","DOI":"10.1109\/CVPR52729.2023.01748"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73220-1_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T20:07:33Z","timestamp":1730578053000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73220-1_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031732195","9783031732201"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73220-1_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}