{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T21:57:39Z","timestamp":1776203859988,"version":"3.50.1"},"reference-count":245,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"MindSpore"},{"name":"Compute Architecture for Neural Networks"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. 
Intell."],"published-print":{"date-parts":[[2023,1,1]]},"DOI":"10.1109\/tpami.2022.3152247","type":"journal-article","created":{"date-parts":[[2022,2,18]],"date-time":"2022-02-18T20:31:49Z","timestamp":1645216309000},"page":"87-110","source":"Crossref","is-referenced-by-count":3636,"title":["A Survey on Vision Transformer"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9761-2702","authenticated-orcid":false,"given":"Kai","family":"Han","sequence":"first","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2709-4946","authenticated-orcid":false,"given":"Yunhe","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"given":"Hanting","family":"Chen","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2102-8235","authenticated-orcid":false,"given":"Xinghao","family":"Chen","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2981-1953","authenticated-orcid":false,"given":"Jianyuan","family":"Guo","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"given":"Zhenhua","family":"Liu","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"given":"Yehui","family":"Tang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0909-3234","authenticated-orcid":false,"given":"An","family":"Xiao","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"given":"Chunjing","family":"Xu","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, 
China"}]},{"given":"Yixing","family":"Xu","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"given":"Zhaohui","family":"Yang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4494-4196","authenticated-orcid":false,"given":"Yiman","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5979-578X","authenticated-orcid":false,"given":"Dacheng","family":"Tao","sequence":"additional","affiliation":[{"name":"School of Computer Science, Faculty of Engineering, University of Sydney, Darlington, NSW, Australia"}]}],"member":"263","reference":[{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01271"},{"key":"ref172","first-page":"1282","article-title":"What are they doing?: Collective activity classification using spatio-temporal relationship among people","author":"choi","year":"0","journal-title":"Proc IEEE CVF Int Conf Comput Vis"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00058"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00331"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00092"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01035"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00194"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01151"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2734779"},{"key":"ref169","first-page":"6299","author":"carreira","year":"2017","journal-title":"Proc Conf Comput Vis Pattern 
Recognit"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00583"},{"key":"ref38","article-title":"TransGAN: Two transformers can make one strong GAN","author":"jiang","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00165"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"ref30","article-title":"Swin-Unet: Unet-like pure transformer for medical image segmentation","author":"cao","year":"2021"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00199"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413775"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58595-2_2"},{"key":"ref181","article-title":"Visualbert: A simple and performant baseline for vision and language","author":"li","year":"2019"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref185","article-title":"The lottery ticket hypothesis: Finding sparse, trainable neural networks","author":"frankle","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.259"},{"key":"ref183","first-page":"4168","article-title":"SpeechBERT: Cross-modal pre-trained language model for end-to-end spoken question answering","author":"chuang","year":"2020","journal-title":"Proc Conf Interspeech"},{"key":"ref182","article-title":"VL-BERT: Pre-training of generic visual-linguistic representations","author":"su","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref189","article-title":"Albert: A lite BERT for self-supervised learning of language 
representations","author":"lan","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.298"},{"key":"ref187","article-title":"Vision transformer pruning","author":"zhu","year":"2021"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01185"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_31"},{"key":"ref27","first-page":"4055","article-title":"Image transformer","author":"parmar","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58583-9_25"},{"key":"ref29","article-title":"Transformer in transformer","author":"han","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.4324\/9781410608918"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/3496.001.0001"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/s11265-013-0768-9"},{"key":"ref26","article-title":"CVonline: The evolving, distributed, non-proprietary, on-line compendium of computer vision","author":"fisher","year":"2008"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00542"},{"key":"ref50","article-title":"Gaussian error linear units (GELUS)","author":"hendrycks","year":"2016"},{"key":"ref51","article-title":"Layer Normalization","author":"ba","year":"2016"},{"key":"ref154","article-title":"TRTR: Visual tracking with transformer","author":"zhao","year":"2021"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00162"},{"key":"ref156","article-title":"TransTrack: Multiple object tracking with 
transformer","author":"sun","year":"2021"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i6.16636"},{"key":"ref152","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"raffel","year":"2020","journal-title":"J Mach Learn Res"},{"key":"ref151","article-title":"Improving visual reasoning by exploiting the knowledge in texts","author":"sharifzadeh","year":"2021"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412265"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00375"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3137605"},{"key":"ref149","first-page":"670","article-title":"Graph R-CNN for scene graph generation","author":"yang","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref59","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","author":"touvron","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref58","article-title":"Visual transformers: Token-based image representation and processing for computer vision","author":"wu","year":"2020"},{"key":"ref57","first-page":"1352","article-title":"Rezero is all you need: Fast convergence at large depth","author":"bachlechner","year":"0","journal-title":"Proc Conf Uncertainty Artif Intell"},{"key":"ref56","first-page":"4381","article-title":"Understanding and improving layer normalization","author":"xu","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref55","first-page":"8741","article-title":"Rethinking batch normalization in transformers","author":"shen","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref54","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by 
reducing internal covariate shift","author":"ioffe","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1176"},{"key":"ref52","article-title":"Adaptive input representations for neural language modeling","author":"baevski","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref40","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00033"},{"key":"ref166","article-title":"ConvTransformer: A convolutional transformer network for video frame synthesis","author":"liu","year":"2020"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/3DV53792.2021.00021"},{"key":"ref164","first-page":"6309","article-title":"Neural discrete representation learning","author":"oord","year":"2017"},{"key":"ref163","article-title":"VitGAN: Training GANs with vision transformers","author":"lee","year":"2021"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01595"},{"key":"ref161","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1007\/s41095-021-0229-5","article-title":"Point cloud transformer","volume":"7","author":"guo","year":"2021","journal-title":"Computational Visual Media"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3116304"},{"key":"ref4","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc Int Conf Neural Inf 
Process"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/5236.001.0001"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1244"},{"key":"ref159","article-title":"Spatiotemporal transformer for video-based person re-identification","author":"zhang","year":"2021"},{"key":"ref7","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2074"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"ref9","first-page":"6000","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref158","article-title":"A video is worth three views: Trigeminal transformers for video-based person re-identification","author":"liu","year":"2021"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.1"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"ref48","first-page":"1243","article-title":"Convolutional sequence to sequence learning","author":"gehring","year":"2017","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref47","first-page":"12837","article-title":"Improving BERT with span-based dynamic convolution","author":"jiang","year":"2020","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref42","article-title":"Mastering text-to-image generation via transformers","author":"ding","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref41","first-page":"8821","article-title":"Zero-shot text-to-image generation","author":"ramesh","year":"0","journal-title":"Proc Int 
Conf Mach Learn"},{"key":"ref44","first-page":"14014","article-title":"Are sixteen heads really better than one?","author":"michel","year":"0"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"ref73","article-title":"Visual Parser: Representing part-whole hierarchies with transformers","author":"sun","year":"2021"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref71","article-title":"Cross-covariance image transformers","author":"el-nouby","year":"2021"},{"key":"ref70","article-title":"Refiner: Refining self-attention for vision transformers","author":"zhou","year":"2021"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20099"},{"key":"ref77","first-page":"377","article-title":"Scalable visual transformers with hierarchical pooling","author":"pan","year":"0","journal-title":"Proc Int Conf Comput Vis"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20252"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01172"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref62","article-title":"Twins: Revisiting the design of spatial attention in vision transformers","author":"chu","year":"2021"},{"key":"ref61","article-title":"Regional-to-local attention for vision transformers","author":"chen","year":"2021"},{"key":"ref63","article-title":"CAT: Cross attention in vision transformer","author":"lin","year":"2021"},{"key":"ref64","article-title":"CSWin transformer: A general vision transformer backbone with cross-shaped windows","author":"dong","year":"2021"},{"key":"ref65","article-title":"Shuffle transformer: Rethinking spatial shuffle for vision 
transformer","author":"huang","year":"2021"},{"key":"ref66","article-title":"MSG-transformer: Exchanging local spatial information by manipulating messenger tokens","author":"fang","year":"2021"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"ref68","article-title":"DeepViT: Towards deeper vision transformer","author":"zhou","year":"2021"},{"key":"ref69","article-title":"KVT: K-NN attention for boosting vision transformers","author":"wang","year":"2021"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-020-1647-3"},{"key":"ref198","article-title":"Reducing transformer depth on demand with structured dropout","author":"fan","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref199","first-page":"9782","article-title":"Dynamic BERT with adaptive width and depth","author":"hou","year":"2020","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref193","article-title":"A distilled version of BERT: Smaller, faster, cheaper and lighter","author":"sanh","year":"2019"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1441"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.195"},{"key":"ref196","article-title":"Well-read students learn better: The impact of student initialization on knowledge distillation","author":"turc","year":"2019"},{"key":"ref95","article-title":"VOLO: Vision outlooker for visual recognition","author":"yuan","year":"2021"},{"key":"ref94","article-title":"Convolutional neural networks meet vision transformers","author":"guo","year":"2021"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.633"},{"key":"ref93","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","author":"tan","year":"0","journal-title":"Proc Int Conf Mach 
Learn"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6409"},{"key":"ref92","article-title":"Feedforward networks for image classification with data-efficient training","author":"touvron","year":"2021"},{"key":"ref192","article-title":"Quantized 8 bit BERT","author":"zafrir","year":"2019"},{"key":"ref91","article-title":"Beyond self-attention: External attention using two linear layers for visual tasks","author":"guo","year":"2021"},{"key":"ref90","article-title":"Do you even need attention? A stack of feed-forward layers does surprisingly well on imagenet","author":"melas-kyriazi","year":"2021"},{"key":"ref98","article-title":"LocalViT: Bringing locality to vision transformers","author":"li","year":"2021"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01204"},{"key":"ref96","article-title":"CVT:Introducing convolutions to vision transformers","author":"wu","year":"2021"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00062"},{"key":"ref82","article-title":"Vision transformer architecture search","author":"su","year":"2021"},{"key":"ref81","article-title":"Scaling vision transformers","author":"zhai","year":"2021"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00008"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01205"},{"key":"ref80","article-title":"Uformer: A general U-shaped transformer for image restoration","author":"wang","year":"2021"},{"key":"ref89","article-title":"MLP-mixer: An all-MLP architecture for vision","author":"tolstikhin","year":"2021"},{"key":"ref85","article-title":"Conditional positional encodings for vision transformers","author":"chu","year":"2021"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00988"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"ref88","article-title":"Augmented shortcuts for vision 
transformers","author":"tang","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.496"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00063"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"ref209","first-page":"4091","article-title":"Searching for low-bit weights in quantized neural networks","author":"yang","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref203","first-page":"2654","article-title":"Do deep nets really need to be deep?","author":"ba","year":"2014","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.202"},{"key":"ref201","article-title":"Distilling the knowledge in a neural network","author":"hinton","year":"2015"},{"key":"ref202","first-page":"535","article-title":"Model compression","author":"bucilu?","year":"0","journal-title":"Proc 12th ACM SIGKDD Int Conf Knowl Discov Data Mining"},{"key":"ref207","article-title":"Efficient vision transformers via fine-grained manifold distillation","author":"jia","year":"2021"},{"key":"ref208","article-title":"Improving the speed of neural networks on CPUs","author":"vanhoucke","year":"0","journal-title":"Proc Int Conf Neural Inf Process Syst Workshop"},{"key":"ref205","article-title":"Deep self-attention distillation for task-agnostic compression of pre-trained transformers","author":"wang","year":"2020"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"ref211","first-page":"379","article-title":"Riptide: Fast end-to-end binarized neural networks","volume":"2","author":"fromm","year":"2020","journal-title":"Proc Mach Learn Syst"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58539-6_26"},{"key":"ref212","article-title":"ProxQuant: Quantized neural networks via 
proximal operators","author":"bai","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref213","article-title":"Efficient 8-bit quantization of transformer neural machine language translation model","author":"bhandare","year":"2019"},{"key":"ref214","article-title":"Quantized transformer","author":"fan","year":"2019"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.sustainlp-1.4"},{"key":"ref216","article-title":"Transformers. zip: Compressing transformers with pruning and quantization","author":"cheong","year":"2019"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60450-9_29"},{"key":"ref218","article-title":"Post-training quantization for vision transformer","author":"liu","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref219","article-title":"Lite transformer with long-short range attention","author":"wu","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref220","article-title":"Is attention better than matrix decomposition","author":"geng","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref222","first-page":"5877","article-title":"The evolved transformer","author":"so","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref221","first-page":"737","article-title":"Neural architecture transformer for accurate and compact architectures","author":"guo","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref229","article-title":"Learning multiple layers of features from tiny images","author":"krizhevsky","year":"2009"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.252631999"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1137\/08074489X"},{"key":"ref226","first-page":"17283","article-title":"Big bird: Transformers for longer sequences","author":"zaheer","year":"0","journal-title":"Proc Conf Neural Informat Process 
Syst"},{"key":"ref225","first-page":"13783","article-title":"O(n) connections are expressive enough: Universal approximability of sparse transformers","author":"yun","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref224","first-page":"5156","article-title":"Transformers are RNNs: Fast autoregressive transformers with linear attention","author":"katharopoulos","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01206"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref126","article-title":"You only look at one sequence: Rethinking transformer in vision through object detection","author":"fang","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"ref124","article-title":"Efficient DETR: Improving end-to-end object detector with dense prior","author":"yao","year":"2021"},{"key":"ref129","article-title":"DETReg: Unsupervised pretraining with region priors for object detection","author":"bar","year":"2021"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00644"},{"key":"ref130","article-title":"ISTR: End-to-end instance segmentation with transformers","author":"hu","year":"2021"},{"key":"ref133","article-title":"Segmenting objects by learning queries","author":"dong","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref134","first-page":"7262","article-title":"Transformer for semantic segmentation","author":"strudel","year":"0","journal-title":"Proc Int Conf Comput Vis"},{"key":"ref131","article-title":"Associating objects with transformers for video object segmentation","author":"yang","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref132","article-title":"Fully transformer networks for semantic image 
segmentation","author":"wu","year":"2021"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1145\/3374217"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00774"},{"key":"ref230","article-title":"A large-scale study of representation learning with the visual task adaptation benchmark","author":"zhai","year":"2019"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1425"},{"key":"ref239","article-title":"Towards understanding the role of over-parametrization in generalization of neural networks","author":"neyshabur","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref238","first-page":"855","article-title":"On the computational efficiency of training neural networks","author":"livni","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1282"},{"key":"ref234","article-title":"Towards robust vision transformer","author":"mao","year":"2021"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1002"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87193-2_4"},{"key":"ref135","article-title":"SegFormer: Simple and efficient design for semantic segmentation with transformers","author":"xie","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref138","first-page":"652","article-title":"PointNet: Deep learning on point sets for 3D classification and segmentation","author":"qi","year":"0","journal-title":"Proc Conf Comput Vis Pattern Recognit"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/BIBM49941.2020.9313305"},{"key":"ref139","first-page":"5099","article-title":"PointNet: Deep hierarchical feature learning on point sets in a metric space","author":"qi","year":"2017","journal-title":"Proc Conf Neural Informat Process 
Syst"},{"key":"ref140","article-title":"HandsFormer: Keypoint transformer for monocular 3D pose estimation of hands and object in interaction","author":"hampali","year":"2021"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01112"},{"key":"ref142","article-title":"Direct human pose estimation with transformers","author":"mao","year":"2021"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00378"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1001\/archpsyc.1962.01720030064010"},{"key":"ref144","article-title":"Test-time personalization with a transformer for human pose estimation","author":"li","year":"2021","journal-title":"Proc Adv Neural Informat Process Syst"},{"key":"ref1","author":"rosenblatt","year":"1957","journal-title":"The Perceptron A perceiving and recognizing automaton Project PARA"},{"key":"ref145","article-title":"DETR for pedestrian detection","author":"lin","year":"2020"},{"key":"ref241","first-page":"19353","article-title":"Model rubik's cube: Twisting resolution, depth and width for tinynets","author":"han","year":"2020","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541967"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875654"},{"key":"ref244","first-page":"4651","article-title":"Perceiver: General perception with iterative attention","author":"jaegle","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref240","first-page":"1580","article-title":"More features from cheap operations","author":"han","year":"0","journal-title":"Proc Conf Comput Vis Pattern Recognit"},{"key":"ref245","article-title":"Perceiver IO: A general architecture for structured inputs & outputs","author":"jaegle","year":"2021"},{"key":"ref109","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"radford","year":"2019","journal-title":"OpenAI 
blog"},{"key":"ref108","article-title":"BEiT:BERT pre-training of image transformers","author":"bao","year":"2021"},{"key":"ref107","article-title":"Masked self-supervised transformer for visual representation","author":"li","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.278"},{"key":"ref105","article-title":"Conditional image generation with pixelCNN decoders","author":"oord","year":"2016"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390294"},{"key":"ref103","first-page":"3","article-title":"Autoencoders, minimum description length, and helmholtz free energy","author":"hinton","year":"1994","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref102","article-title":"Early convolutions help transformers see better","author":"xiao","year":"2021","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref111","article-title":"Efficient self-supervised vision transformers for representation learning","author":"li","year":"2021"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref110","article-title":"Self-supervised learning with swin transformers","author":"xie","year":"2021"},{"key":"ref10","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"0","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics-Hum Lang Technol"},{"key":"ref11","first-page":"1877","article-title":"Language models are few-shot learners","author":"brown","year":"0","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref14","first-page":"1691","article-title":"Generative pretraining from 
pixels","author":"chen","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref15","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref16","first-page":"213","article-title":"End-to-end object detection with transformers","author":"carion","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_20"},{"key":"ref17","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","author":"zhu","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01159"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01212"},{"key":"ref119","first-page":"13564","article-title":"Bridging visual representations for object detection via transformer decoder","author":"chi","year":"2020","journal-title":"Proc Conf Neural Informat Process Syst"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3082763"},{"key":"ref113","article-title":"Toward transformer-based object detection","author":"beal","year":"2020"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00374"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00738"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00359"},{"key":"ref121","article-title":"End-to-end object detection with adaptive clustering transformer","author":"zheng","year":"0","journal-title":"Proc Brit Mach Vis Assoc"},{"key":"ref122","article-title":"Oriented object detection with 
transformer","author":"ma","year":"2021"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00360"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/9970415\/09716741.pdf?arnumber=9716741","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T18:41:07Z","timestamp":1742409667000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9716741\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,1]]},"references-count":245,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2022.3152247","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,1,1]]}}}