{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:58:30Z","timestamp":1782835110221,"version":"3.54.5"},"publisher-location":"Cham","reference-count":112,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031200823","type":"print"},{"value":"9783031200830","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20083-0_37","type":"book-chapter","created":{"date-parts":[[2022,11,2]],"date-time":"2022-11-02T19:46:34Z","timestamp":1667418394000},"page":"620-640","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":153,"title":["SPViT: Enabling Faster Vision Transformers via\u00a0Latency-Aware Soft Token 
Pruning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8120-4456","authenticated-orcid":false,"given":"Zhenglun","family":"Kong","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5287-5149","authenticated-orcid":false,"given":"Peiyan","family":"Dong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1392-2787","authenticated-orcid":false,"given":"Xiaolong","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2228-0587","authenticated-orcid":false,"given":"Xin","family":"Meng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2697-7042","authenticated-orcid":false,"given":"Wei","family":"Niu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3540-1464","authenticated-orcid":false,"given":"Mengshu","family":"Sun","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4965-7321","authenticated-orcid":false,"given":"Xuan","family":"Shen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9844-992X","authenticated-orcid":false,"given":"Geng","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4116-5237","authenticated-orcid":false,"given":"Bin","family":"Ren","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2077-1246","authenticated-orcid":false,"given":"Hao","family":"Tang","sequence":"add
itional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5172-5309","authenticated-orcid":false,"given":"Minghai","family":"Qin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3024-7990","authenticated-orcid":false,"given":"Yanzhi","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,11,3]]},"reference":[{"key":"37_CR1","doi-asserted-by":"crossref","unstructured":"Amini, A., Periyasamy, A.S., Behnke, S.: T6d-direct: transformers for multi-object 6d pose direct regression. arXiv preprint arXiv:2109.10948 (2021)","DOI":"10.1007\/978-3-030-92659-5_34"},{"key":"37_CR2","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEit: BERT pre-training of image transformers. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=p-BhZSz59o4"},{"key":"37_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"37_CR4","doi-asserted-by":"crossref","unstructured":"Chang, S.E., et al.: Mix and match: a novel fpga-centric deep neural network quantization framework. In: 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA), pp. 208\u2013220. 
IEEE (2021)","DOI":"10.1109\/HPCA51647.2021.00027"},{"key":"37_CR5","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Transformer interpretability beyond attention visualization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 782\u2013791 (2021)","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"37_CR6","unstructured":"Chen, B., et al.: Psvit: better vision transformer via token pooling and attention sharing. arXiv preprint arXiv:2108.03428 (2021)"},{"key":"37_CR7","doi-asserted-by":"crossref","unstructured":"Chen, C.F.R., Fan, Q., Panda, R.: Crossvit: cross-attention multi-scale vision transformer for image classification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 357\u2013366 (2021)","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"37_CR8","doi-asserted-by":"crossref","unstructured":"Chen, H., et al.: Pre-trained image processing transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12299\u201312310 (2021)","DOI":"10.1109\/CVPR46437.2021.01212"},{"key":"37_CR9","doi-asserted-by":"crossref","unstructured":"Chen, M., Peng, H., Fu, J., Ling, H.: Autoformer: searching transformers for visual recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12270\u201312280 (2021)","DOI":"10.1109\/ICCV48922.2021.01205"},{"key":"37_CR10","unstructured":"Chen, P., Chen, Y., Liu, S., Yang, M., Jia, J.: Exploring and improving mobile level vision transformers. arXiv preprint arXiv:2108.13015 (2021)"},{"key":"37_CR11","unstructured":"Chen, T., Chen, X., Ma, X., Wang, Y., Wang, Z.: Coarsening the granularity: towards structurally sparse lottery tickets. 
In: Proceedings of the International Conference on Machine Learning (ICML) (2022)"},{"key":"37_CR12","unstructured":"Chen, T., Cheng, Y., Gan, Z., Yuan, L., Zhang, L., Wang, Z.: Chasing sparsity in vision transformers: an end-to-end exploration. In: Advances in Neural Information Processing Systems (2021)"},{"key":"37_CR13","unstructured":"Chen, T., Saxena, S., Li, L., Fleet, D.J., Hinton, G.: Pix2seq: a language modeling framework for object detection. arXiv preprint arXiv:2109.10852 (2021)"},{"key":"37_CR14","unstructured":"Chen, X., Hsieh, C.J., Gong, B.: When vision transformers outperform resnets without pre-training or strong data augmentations. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=LtKcMgGOeLt"},{"key":"37_CR15","doi-asserted-by":"crossref","unstructured":"Chen, X., Yan, B., Zhu, J., Wang, D., Yang, X., Lu, H.: Transformer tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8126\u20138135 (2021)","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"37_CR16","unstructured":"Cheng, B., Schwing, A., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. In: Beygelzimer, A., Dauphin, Y., Liang, P., Vaughan, J.W. (eds.) Advances in Neural Information Processing Systems (2021). https:\/\/openreview.net\/forum?id=0lz69oI5iZP"},{"key":"37_CR17","doi-asserted-by":"crossref","unstructured":"Chu, C., et al.: Pim-prune: fine-grain dcnn pruning for crossbar-based process-in-memory architecture. In: 2020 57th ACM\/IEEE Design Automation Conference (DAC), pp. 1\u20136. IEEE (2020)","DOI":"10.1109\/DAC18072.2020.9218523"},{"key":"37_CR18","unstructured":"Chu, X., et al.: Conditional positional encodings for vision transformers. arXiv preprint arXiv:2102.10882 (2021)"},{"key":"37_CR19","doi-asserted-by":"crossref","unstructured":"Dai, Z., Cai, B., Lin, Y., Chen, J.: Up-detr: unsupervised pre-training for object detection with transformers. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1601\u20131610 (2021)","DOI":"10.1109\/CVPR46437.2021.00165"},{"key":"37_CR20","doi-asserted-by":"publisher","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009). https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"37_CR21","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H.: Transvg: end-to-end visual grounding with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1769\u20131779 (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"37_CR22","unstructured":"Dosovitskiy, A., et al.: An image is worth 16 $$\\times $$ 16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"37_CR23","unstructured":"El-Nouby, A., Neverova, N., Laptev, I., J\u00e9gou, H.: Training vision transformers for image retrieval. arXiv preprint arXiv:2102.05644 (2021)"},{"key":"37_CR24","unstructured":"El-Nouby, A., et al.: XCit: Cross-covariance image transformers. In: Beygelzimer, A., Dauphin, Y., Liang, P., Vaughan, J.W. (eds.) Advances in Neural Information Processing Systems (2021). https:\/\/openreview.net\/forum?id=kzPtpIpF8o"},{"key":"37_CR25","doi-asserted-by":"crossref","unstructured":"Fang, H., Mei, Z., Shrestha, A., Zhao, Z., Li, Y., Qiu, Q.: Encoding, model, and architecture: systematic optimization for spiking neural network in fpgas. In: 2020 IEEE\/ACM International Conference On Computer Aided Design (ICCAD), pp. 1\u20139. 
IEEE (2020)","DOI":"10.1145\/3400302.3415608"},{"key":"37_CR26","doi-asserted-by":"crossref","unstructured":"Fang, H., Shrestha, A., Zhao, Z., Qiu, Q.: Exploiting neuron and synapse filter dynamics in spatial temporal learning of deep spiking neural network. In: Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence. IJCAI 2020 (2021)","DOI":"10.24963\/ijcai.2020\/388"},{"key":"37_CR27","doi-asserted-by":"crossref","unstructured":"Fang, H., Taylor, B., Li, Z., Mei, Z., Li, H.H., Qiu, Q.: Neuromorphic algorithm-hardware codesign for temporal pattern learning. In: 2021 58th ACM\/IEEE Design Automation Conference (DAC), pp. 361\u2013366. IEEE (2021)","DOI":"10.1109\/DAC18074.2021.9586133"},{"key":"37_CR28","unstructured":"Fayyaz, M., et al.: Ats: adaptive token sampling for efficient vision transformers. arXiv preprint arXiv:2111.15667 (2021)"},{"key":"37_CR29","unstructured":"Gao, P., Lu, J., Li, H., Mottaghi, R., Kembhavi, A.: Container: context aggregation network. arXiv preprint arXiv:2106.01401 (2021)"},{"key":"37_CR30","doi-asserted-by":"crossref","unstructured":"Gong, Y., et al.: A privacy-preserving-oriented dnn pruning and mobile acceleration framework. In: Proceedings of the 2020 on Great Lakes Symposium on VLSI, pp. 119\u2013124 (2020)","DOI":"10.1145\/3386263.3407650"},{"key":"37_CR31","doi-asserted-by":"crossref","unstructured":"Graham, B., et al.: Levit: a vision transformer in convnet\u2019s clothing for faster inference. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 12259\u201312269 (2021)","DOI":"10.1109\/ICCV48922.2021.01204"},{"key":"37_CR32","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Accelerating sparse dnn models without hardware-support via tile-wise sparsity. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201315. 
IEEE (2020)","DOI":"10.1109\/SC41405.2020.00020"},{"issue":"2","key":"37_CR33","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1007\/s41095-021-0229-5","volume":"7","author":"MH Guo","year":"2021","unstructured":"Guo, M.H., Cai, J.X., Liu, Z.N., Mu, T.J., Martin, R.R., Hu, S.M.: Pct: point cloud transformer. Comput. Visual Media 7(2), 187\u2013199 (2021)","journal-title":"Comput. Visual Media"},{"key":"37_CR34","unstructured":"Han, K., Xiao, A., Wu, E., Guo, J., Xu, C., Wang, Y.: Transformer in transformer. In: Advances in Neural Information Processing Systems (2021)"},{"key":"37_CR35","doi-asserted-by":"crossref","unstructured":"Heo, B., Yun, S., Han, D., Chun, S., Choe, J., Oh, S.J.: Rethinking spatial dimensions of vision transformers. In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.01172"},{"key":"37_CR36","doi-asserted-by":"crossref","unstructured":"Hou, Z., et al.: Chex: channel exploration for cnn model compression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12287\u201312298 (2022)","DOI":"10.1109\/CVPR52688.2022.01197"},{"key":"37_CR37","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"37_CR38","unstructured":"Hudson, D.A., Zitnick, C.L.: Generative adversarial transformers. In: Proceedings of the 38th International Conference on Machine Learning, ICML 2021 (2021)"},{"key":"37_CR39","unstructured":"Jia, D., et al.: Efficient vision transformers via fine-grained manifold distillation. arXiv preprint arXiv:2107.01378 (2021)"},{"key":"37_CR40","unstructured":"Jiang, Z., et al.: All tokens matter: token labeling for training better vision transformers. 
arXiv preprint arXiv:2104.10858 (2021)"},{"key":"37_CR41","doi-asserted-by":"crossref","unstructured":"Kim, B., Lee, J., Kang, J., Kim, E.S., Kim, H.J.: Hotr: end-to-end human-object interaction detection with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 74\u201383 (2021)","DOI":"10.1109\/CVPR46437.2021.00014"},{"key":"37_CR42","unstructured":"Kornblith, S., Norouzi, M., Lee, H., Hinton, G.: Similarity of neural network representations revisited. In: International Conference on Machine Learning, pp. 3519\u20133529. PMLR (2019)"},{"key":"37_CR43","doi-asserted-by":"crossref","unstructured":"Li, B., et al.: Efficient transformer-based large scale language representations using hardware-friendly block structured pruning. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 3187\u20133199 (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.286"},{"key":"37_CR44","doi-asserted-by":"crossref","unstructured":"Li, Y., Fang, H., Li, M., Ma, Y., Qiu, Q.: Neural network pruning and fast training for drl-based uav trajectory planning. In: 2022 27th Asia and South Pacific Design Automation Conference (ASP-DAC), pp. 574\u2013579. IEEE (2022)","DOI":"10.1109\/ASP-DAC52403.2022.9712561"},{"key":"37_CR45","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Revisiting stereo depth estimation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6197\u20136206 (2021)","DOI":"10.1109\/ICCV48922.2021.00614"},{"key":"37_CR46","unstructured":"Liang, Y., GE, C., Tong, Z., Song, Y., Wang, J., Xie, P.: EVit: expediting vision transformers via token reorganizations. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=BjyvwnXXVn_"},{"key":"37_CR47","unstructured":"Liu, N., et al.: Lottery ticket preserves weight correlation: is it desirable or not? 
In: International Conference on Machine Learning (ICML), pp. 7011\u20137020. PMLR (2021)"},{"key":"37_CR48","unstructured":"Liu, Y., Sangineto, E., Bi, W., Sebe, N., Lepri, B., De Nadai, M.: Efficient training of visual transformers with small-size datasets. arXiv preprint arXiv:2106.03746 (2021)"},{"key":"37_CR49","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"37_CR50","doi-asserted-by":"crossref","unstructured":"Lu, Z., Liu, H., Li, J., Zhang, L.: Efficient transformer for single image super-resolution. arXiv preprint arXiv:2108.11084 (2021)","DOI":"10.1109\/CVPRW56347.2022.00061"},{"key":"37_CR51","doi-asserted-by":"crossref","unstructured":"Ma, X., et al.: PCONV: the missing but desirable sparsity in DNN weight pruning for real-time execution on mobile devices. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol. 34, pp. 5117\u20135124 (2020)","DOI":"10.1609\/aaai.v34i04.5954"},{"key":"37_CR52","doi-asserted-by":"crossref","unstructured":"Ma, X., et al.: Non-structured dnn weight pruning-is it beneficial in any platform? In: IEEE Transactions on Neural Networks and Learning Systems (TNNLS) (2021)","DOI":"10.1109\/TNNLS.2021.3063265"},{"key":"37_CR53","doi-asserted-by":"publisher","unstructured":"Ma, X., et al.: An image enhancing pattern-based sparsity for real-time inference on mobile devices. In: Proceedings of the European conference on computer vision (ECCV). pp. 629\u2013645. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-58601-0_37","DOI":"10.1007\/978-3-030-58601-0_37"},{"key":"37_CR54","unstructured":"Ma, X., et al.: Effective model sparsification by scheduled grow-and-prune methods. 
In: Proceedings of the International Conference on Learning Representations (ICLR) (2021)"},{"key":"37_CR55","doi-asserted-by":"crossref","unstructured":"Ma, X., et al.: Blcr: Towards real-time dnn execution with block-based reweighted pruning. In: International Symposium on Quality Electronic Design (ISQED), pp. 1\u20138. IEEE (2022)","DOI":"10.1109\/ISQED54688.2022.9806237"},{"key":"37_CR56","doi-asserted-by":"crossref","unstructured":"Ma, X., et al.: Tiny but accurate: a pruned, quantized and optimized memristor crossbar framework for ultra efficient dnn implementation. In: 2020 25th Asia and South Pacific design automation conference (ASP-DAC), pp. 301\u2013306. IEEE (2020)","DOI":"10.1109\/ASP-DAC47756.2020.9045658"},{"key":"37_CR57","unstructured":"Ma, X., et al.: Sanity checks for lottery tickets: Does your winning ticket really win the jackpot? In: Advances in Neural Information Processing Systems (NeurIPS) 34 (2021)"},{"key":"37_CR58","unstructured":"Mao, M., et al.: Dual-stream network for visual recognition. In: Advances in Neural Information Processing Systems (2021)"},{"key":"37_CR59","doi-asserted-by":"crossref","unstructured":"Meinhardt, T., Kirillov, A., Leal-Taixe, L., Feichtenhofer, C.: Trackformer: multi-object tracking with transformers. arXiv preprint arXiv:2101.02702 (2021)","DOI":"10.1109\/CVPR52688.2022.00864"},{"key":"37_CR60","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., Joulin, A.: An end-to-end transformer model for 3d object detection. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"37_CR61","doi-asserted-by":"crossref","unstructured":"Niu, W., et al.: A compression-compilation framework for on-mobile real-time bert applications. 
arXiv preprint arXiv:2106.00526 (2021)","DOI":"10.24963\/ijcai.2021\/712"},{"key":"37_CR62","doi-asserted-by":"crossref","unstructured":"Niu, W., et al.: Grim: A general, real-time deep learning inference framework for mobile devices based on fine-grained structured weight sparsity. In: IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI) (2021)","DOI":"10.1109\/TPAMI.2021.3089687"},{"key":"37_CR63","doi-asserted-by":"crossref","unstructured":"Niu, W., et al.: Patdnn: achieving real-time dnn execution on mobile devices with pattern-based weight pruning. In: Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), pp. 907\u2013922 (2020)","DOI":"10.1145\/3373376.3378534"},{"key":"37_CR64","unstructured":"Pan, B., Jiang, Y., Panda, R., Wang, Z., Feris, R., Oliva, A.: Ia-red$$^2$$: Interpretability-aware redundancy reduction for vision transformers. In: Advances in Neural Information Processing Systems (2021)"},{"key":"37_CR65","doi-asserted-by":"crossref","unstructured":"Pan, Z., Zhuang, B., Liu, J., He, H., Cai, J.: Scalable vision transformers with hierarchical pooling. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 377\u2013386 (2021)","DOI":"10.1109\/ICCV48922.2021.00043"},{"key":"37_CR66","unstructured":"Prillo, S., Eisenschlos, J.: Softsort: a continuous relaxation for the argsort operator. In: International Conference on Machine Learning, pp. 7793\u20137802. PMLR (2020)"},{"key":"37_CR67","doi-asserted-by":"crossref","unstructured":"Radosavovic, I., Kosaraju, R.P., Girshick, R., He, K., Doll\u00e1r, P.: Designing network design spaces. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 
10428\u201310436 (2020)","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"37_CR68","unstructured":"Raghu, M., Unterthiner, T., Kornblith, S., Zhang, C., Dosovitskiy, A.: Do vision transformers see like convolutional neural networks? arXiv preprint arXiv:2108.08810 (2021)"},{"key":"37_CR69","unstructured":"Rao, Y., Zhao, W., Liu, B., Lu, J., Zhou, J., Hsieh, C.J.: Dynamicvit: efficient vision transformers with dynamic token sparsification. In: Advances in Neural Information Processing Systems (2021)"},{"key":"37_CR70","doi-asserted-by":"crossref","unstructured":"Ren, A., et al.: Admm-nn: an algorithm-hardware co-design framework of dnns using alternating direction methods of multipliers. In: Proceedings of the Twenty-Fourth International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 925\u2013938 (2019)","DOI":"10.1145\/3297858.3304076"},{"key":"37_CR71","unstructured":"Renggli, C., Pinto, A.S., Houlsby, N., Mustafa, B., Puigcerver, J., Riquelme, C.: Learning to merge tokens in vision transformers. arXiv preprint arXiv:2202.12015 (2022)"},{"key":"37_CR72","doi-asserted-by":"crossref","unstructured":"Rumi, M.A., Ma, X., Wang, Y., Jiang, P.: Accelerating sparse cnn inference on gpus with performance-aware weight pruning. In: Proceedings of the ACM International Conference on Parallel Architectures and Compilation Techniques (PACT), pp. 267\u2013278 (2020)","DOI":"10.1145\/3410463.3414648"},{"key":"37_CR73","unstructured":"Ryoo, M.S., Piergiovanni, A., Arnab, A., Dehghani, M., Angelova, A.: Tokenlearner: what can 8 learned tokens do for images and videos? In: Advances in Neural Information Processing Systems (2021)"},{"key":"37_CR74","unstructured":"Sanh, V., Wolf, T., Rush, A.M.: Movement pruning: adaptive sparsity by fine-tuning. 
arXiv preprint arXiv:2005.07683 (2020)"},{"key":"37_CR75","doi-asserted-by":"crossref","unstructured":"Srinivas, A., Lin, T.Y., Parmar, N., Shlens, J., Abbeel, P., Vaswani, A.: Bottleneck transformers for visual recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16519\u201316529 (2021)","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"37_CR76","unstructured":"Steiner, A., Kolesnikov, A., Zhai, X., Wightman, R., Uszkoreit, J., Beyer, L.: How to train your vit? data, augmentation, and regularization in vision transformers. arXiv preprint arXiv:2106.10270 (2021)"},{"key":"37_CR77","doi-asserted-by":"crossref","unstructured":"Tan, Z., et al.: Pcnn: pattern-based fine-grained regular pruning towards optimizing cnn accelerators. In: 2020 57th ACM\/IEEE Design Automation Conference (DAC), pp. 1\u20136. IEEE (2020)","DOI":"10.1109\/DAC18072.2020.9218498"},{"key":"37_CR78","doi-asserted-by":"crossref","unstructured":"Tang, Y., et al.: Patch slimming for efficient vision transformers (2021)","DOI":"10.1109\/CVPR52688.2022.01185"},{"key":"37_CR79","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: ICML (2021)"},{"key":"37_CR80","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"37_CR81","doi-asserted-by":"crossref","unstructured":"Wang, H., Zhang, Z., Han, S.: Spatten: efficient sparse attention architecture with cascade token and head pruning. In: 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA), pp. 97\u2013110. IEEE (2021)","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"37_CR82","doi-asserted-by":"crossref","unstructured":"Wang, P., et al.: Kvt: k-nn attention for boosting vision transformers. 
arXiv preprint arXiv:2106.00515 (2021)","DOI":"10.1007\/978-3-031-20053-3_17"},{"key":"37_CR83","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Pyramid vision transformer: a versatile backbone for dense prediction without convolutions. In: IEEE ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"37_CR84","unstructured":"Wu, B., et al.: Visual transformers: token-based image representation and processing for computer vision. arXiv preprint arXiv:2006.03677 (2020)"},{"key":"37_CR85","doi-asserted-by":"crossref","unstructured":"Wu, H., et al.: Cvt: introducing convolutions to vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 22\u201331 (2021)","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"37_CR86","doi-asserted-by":"crossref","unstructured":"Wu, K., Peng, H., Chen, M., Fu, J., Chao, H.: Rethinking and improving relative position encoding for vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10033\u201310041 (2021)","DOI":"10.1109\/ICCV48922.2021.00988"},{"key":"37_CR87","doi-asserted-by":"crossref","unstructured":"Xu, C., et al.: You only group once: efficient point-cloud processing with token representation and relation inference module. arXiv preprint arXiv:2103.09975 (2021)","DOI":"10.1109\/IROS51168.2021.9636858"},{"key":"37_CR88","doi-asserted-by":"crossref","unstructured":"Xu, W., Xu, Y., Chang, T., Tu, Z.: Co-scale conv-attentional image transformers. arXiv preprint arXiv:2104.06399 (2021)","DOI":"10.1109\/ICCV48922.2021.00983"},{"key":"37_CR89","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: Evo-vit: slow-fast token evolution for dynamic vision transformer. 
In: Proceedings of the AAAI Conference on Artificial Intelligence (2022)","DOI":"10.1609\/aaai.v36i3.20202"},{"key":"37_CR90","doi-asserted-by":"crossref","unstructured":"Xue, F., Wang, Q., Guo, G.: Transfer: learning relation-aware facial expression representations with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3601\u20133610 (2021)","DOI":"10.1109\/ICCV48922.2021.00358"},{"key":"37_CR91","doi-asserted-by":"crossref","unstructured":"Yan, B., Peng, H., Fu, J., Wang, D., Lu, H.: Learning spatio-temporal transformer for visual tracking. arXiv preprint arXiv:2103.17154 (2021)","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"37_CR92","doi-asserted-by":"crossref","unstructured":"Yang, C., Wu, Z., Zhou, B., Lin, S.: Instance localization for self-supervised detection pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3987\u20133996 (2021)","DOI":"10.1109\/CVPR46437.2021.00398"},{"key":"37_CR93","doi-asserted-by":"crossref","unstructured":"Yang, F., Yang, H., Fu, J., Lu, H., Guo, B.: Learning texture transformer network for image super-resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5791\u20135800 (2020)","DOI":"10.1109\/CVPR42600.2020.00583"},{"key":"37_CR94","doi-asserted-by":"crossref","unstructured":"Yang, G., Tang, H., Ding, M., Sebe, N., Ricci, E.: Transformer-based attention networks for continuous pixel-wise prediction. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01596"},{"key":"37_CR95","unstructured":"Yu, H., Wu, J.: A unified pruning framework for vision transformers. arXiv preprint arXiv:2111.15127 (2021)"},{"key":"37_CR96","unstructured":"Yu, Q., Xia, Y., Bai, Y., Lu, Y., Yuille, A., Shen, W.: Glance-and-gaze vision transformer. In: Advances in Neural Information Processing Systems (2021)"},{"key":"37_CR97","unstructured":"Yu, S., et al.: Unified visual transformer compression. 
In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=9jsZiUgkCZP"},{"key":"37_CR98","doi-asserted-by":"crossref","unstructured":"Yuan, G., et al.: Tinyadc: Peripheral circuit-aware weight pruning framework for mixed-signal dnn accelerators. In: 2021 Design, Automation & Test in Europe Conference & Exhibition (DATE), pp. 926\u2013931. IEEE (2021)","DOI":"10.23919\/DATE51398.2021.9474235"},{"key":"37_CR99","doi-asserted-by":"crossref","unstructured":"Yuan, G., et al.: Improving dnn fault tolerance using weight pruning and differential crossbar mapping for reram-based edge ai. In: 2021 22nd International Symposium on Quality Electronic Design (ISQED), pp. 135\u2013141. IEEE (2021)","DOI":"10.1109\/ISQED51717.2021.9424332"},{"key":"37_CR100","doi-asserted-by":"crossref","unstructured":"Yuan, G., et al.: An ultra-efficient memristor-based dnn framework with structured weight pruning and quantization using admm. In: 2019 IEEE\/ACM International Symposium on Low Power Electronics and Design (ISLPED), pp. 1\u20136. IEEE (2019)","DOI":"10.1109\/ISLPED.2019.8824944"},{"key":"37_CR101","unstructured":"Yuan, G., et al.: Mest: accurate and fast memory-economic sparse training framework on the edge. In: Advances in Neural Information Processing Systems (NeurIPS) 34 (2021)"},{"key":"37_CR102","doi-asserted-by":"crossref","unstructured":"Yuan, K., Guo, S., Liu, Z., Zhou, A., Yu, F., Wu, W.: Incorporating convolution designs into visual transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 579\u2013588 (2021)","DOI":"10.1109\/ICCV48922.2021.00062"},{"key":"37_CR103","doi-asserted-by":"crossref","unstructured":"Yuan, L., et al.: Tokens-to-token vit: training vision transformers from scratch on imagenet. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 
558\u2013567 (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"37_CR104","doi-asserted-by":"crossref","unstructured":"Yuan, L., et al.: Tokens-to-token vit: training vision transformers from scratch on imagenet. arXiv preprint arXiv:2101.11986 (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"37_CR105","doi-asserted-by":"crossref","unstructured":"Yue, X., Sun, S., Kuang, Z., Wei, M., Torr, P.H., Zhang, W., Lin, D.: Vision transformer with progressive sampling. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 387\u2013396 (2021)","DOI":"10.1109\/ICCV48922.2021.00044"},{"key":"37_CR106","doi-asserted-by":"crossref","unstructured":"Zhai, X., Kolesnikov, A., Houlsby, N., Beyer, L.: Scaling vision transformers. arXiv preprint arXiv:2106.04560 (2021)","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"37_CR107","doi-asserted-by":"crossref","unstructured":"Zhang, T., et al.: A unified dnn weight pruning framework using reweighted optimization methods. In: 2021 58th ACM\/IEEE Design Automation Conference (DAC), pp. 493\u2013498. IEEE (2021)","DOI":"10.1109\/DAC18074.2021.9586152"},{"key":"37_CR108","doi-asserted-by":"crossref","unstructured":"Zhang, T., et al.: Structadmm: achieving ultrahigh efficiency in structured pruning for dnns. In: IEEE Transactions on Neural Networks and Learning Systems (TNNLS) (2021)","DOI":"10.1109\/TNNLS.2020.3045153"},{"key":"37_CR109","doi-asserted-by":"crossref","unstructured":"Zhao, H., Jiang, L., Jia, J., Torr, P.H., Koltun, V.: Point transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16259\u201316268 (2021)","DOI":"10.1109\/ICCV48922.2021.01595"},{"key":"37_CR110","doi-asserted-by":"crossref","unstructured":"Zheng, S., et al.: Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
6881\u20136890 (2021)","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"37_CR111","unstructured":"Zhou, D., et al.: Refiner: refining self-attention for vision transformers (2021)"},{"key":"37_CR112","unstructured":"Zhu, M., Han, K., Tang, Y., Wang, Y.: Visual transformer pruning. In: KDD 2021 Workshop on Model Mining (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20083-0_37","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,13]],"date-time":"2024-03-13T12:08:35Z","timestamp":1710331715000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20083-0_37"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031200823","9783031200830"],"references-count":112,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20083-0_37","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"3 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers 
Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}