{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T04:34:37Z","timestamp":1780634077400,"version":"3.54.1"},"reference-count":123,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&amp;D Program of China","award":["2022ZD0160100"],"award-info":[{"award-number":["2022ZD0160100"]}]},{"name":"National Key R&amp;D Program of China","award":["2022ZD0160505"],"award-info":[{"award-number":["2022ZD0160505"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272450"],"award-info":[{"award-number":["62272450"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Research Program","award":["RCJC20200714114557087"],"award-info":[{"award-number":["RCJC20200714114557087"]}]},{"DOI":"10.13039\/501100004739","name":"Youth Innovation Promotion Association of the Chinese Academy of Sciences","doi-asserted-by":"publisher","award":["2020355"],"award-info":[{"award-number":["2020355"]}],"id":[{"id":"10.13039\/501100004739","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2023,10]]},"DOI":"10.1109\/tpami.2023.3282631","type":"journal-article","created":{"date-parts":[[2023,6,5]],"date-time":"2023-06-05T17:49:48Z","timestamp":1685987388000},"page":"12581-12600","source":"Crossref","is-referenced-by-count":454,"title":["UniFormer: Unifying Convolution and Self-Attention for Visual Recognition"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5612-0341","authenticated-orcid":false,"given":"Kunchang","family":"Li","sequence":"first","affiliation":[{"name":"ShenZhen Key Lab of Computer Vision and Pattern Recognition, Shenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2999-7428","authenticated-orcid":false,"given":"Yali","family":"Wang","sequence":"additional","affiliation":[{"name":"ShenZhen Key Lab of Computer Vision and Pattern Recognition, Shenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Junhao","family":"Zhang","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Peng","family":"Gao","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laborator, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guanglu","family":"Song","sequence":"additional","affiliation":[{"name":"SenseTime Research, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yu","family":"Liu","sequence":"additional","affiliation":[{"name":"SenseTime Research, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2664-7975","authenticated-orcid":false,"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[{"name":"Chinese University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1889-2567","authenticated-orcid":false,"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[{"name":"ShenZhen Key Lab of Computer Vision and Pattern Recognition, Shenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref57","article-title":"EVIT: Expediting vision transformers via token reorganizations","author":"liang","year":"2022","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00210"},{"key":"ref59","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref53","first-page":"13557","article-title":"VidTr: Video transformer without convolutions","author":"li","year":"2021","journal-title":"Proc IEEE Int Conf Comput Vis"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00117"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01112"},{"key":"ref51","article-title":"CT-Net: Channel tensorization network for video classification","author":"li","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00198"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00656"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2021.04.024"},{"key":"ref48","first-page":"1106","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01576"},{"key":"ref42","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015"},{"key":"ref41","article-title":"Shuffle transformer: Rethinking spatial shuffle for vision transformer","author":"huang","year":"2021"},{"key":"ref44","first-page":"18590","article-title":"All tokens matter: Token labeling for training better vision transformers","author":"jiang","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_21"},{"key":"ref8","first-page":"213","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref7","article-title":"Video super-resolution transformer","author":"cao","year":"2021"},{"key":"ref9","article-title":"A short note about Kinetics-600","author":"carreira","year":"2018"},{"key":"ref4","first-page":"1059","article-title":"High-performance large-scale image recognition without normalization","author":"brock","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref3","first-page":"813","article-title":"Is space-time attention all you need for video understanding?","author":"bertasius","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2956516"},{"key":"ref5","first-page":"19594","article-title":"Space-time mixing attention for video transformer","author":"bulat","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01204"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref31","article-title":"Container: Context aggregation network","author":"gao","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref30","first-page":"214","article-title":"Multi-modal transformer for video retrieval","author":"gabeur","year":"2020","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref32","article-title":"Accurate, large minibatch SGD: Training ImageNet in 1 hour","author":"goyal","year":"2017"},{"key":"ref39","article-title":"MobileNets: Efficient convolutional neural networks for mobile vision applications","author":"howard","year":"2017"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00585"},{"key":"ref23","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref26","article-title":"PySlowFast","author":"fan","year":"2020"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00374"},{"key":"ref20","first-page":"3965","article-title":"CoAtNet: Marrying convolution and attention for all data sizes","author":"dai","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01181"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00520"},{"key":"ref12","article-title":"MMDetection: Open MMLab detection toolbox and benchmark","author":"chen","year":"2019"},{"key":"ref15","first-page":"9355","article-title":"Twins: Revisiting the design of spatial attention in vision transformers","author":"chu","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref14","article-title":"Per-pixel classification is not all you need for semantic segmentation","author":"cheng","year":"0","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00162"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01212"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-022-0274-8"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref17","article-title":"OpenMMLab pose estimation toolbox and benchmark","year":"2020"},{"key":"ref16","article-title":"Do we really need explicit position encodings for vision transformers?","author":"chu","year":"2021"},{"key":"ref19","article-title":"On the relationship between self-attention and convolutional layers","author":"cordonnier","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref18","article-title":"MMSegmentation: OpenMMLab semantic segmentation toolbox and benchmark","year":"2020"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00043"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2983686"},{"key":"ref91","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00565"},{"key":"ref86","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","author":"touvron","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref85","first-page":"10096","article-title":"EfficientNetV2: Smaller models and faster training","author":"tan","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.97"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"ref84","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","author":"tan","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref80","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"soomro","year":"2012"},{"key":"ref79","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014"},{"key":"ref108","article-title":"Focal self-attention for local-global interactions in vision transformers","author":"yang","year":"0","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref78","article-title":"An image is worth 16x16 words, what is a video worth?","author":"sharir","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref109","article-title":"TransPose: Towards explainable human pose estimation by transformer","author":"yang","year":"2020"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20202"},{"key":"ref75","first-page":"13937","article-title":"DynamicViT: Efficient vision transformers with dynamic token sparsification","author":"rao","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref104","article-title":"Early convolutions help transformers see better","author":"xiao","year":"0","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref74","first-page":"68","article-title":"Stand-alone self-attention in vision models","author":"ramachandran","year":"2019","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref105","first-page":"12077","article-title":"SegFormer: Simple and efficient design for semantic segmentation with transformers","author":"xie","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01228-7"},{"key":"ref102","first-page":"472","article-title":"Simple baselines for human pose estimation and tracking","author":"xiao","year":"2018","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"ref2","article-title":"Layer normalization","author":"ba","year":"2016"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"ref70","first-page":"12493","article-title":"Keeping your eye on the ball: Trajectory attention in video transformers","author":"patrick","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref112","first-page":"6575","article-title":"VOLO: Vision outlooker for visual recognition","volume":"45","author":"yuan","year":"2023","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01233"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00062"},{"key":"ref68","article-title":"MobileViT: Light-weight, general-purpose, and mobile-friendly vision transformer","author":"mehta","year":"2022","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00716"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"ref117","article-title":"mixup: Beyond empirical risk minimization","author":"zhang","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00299"},{"key":"ref64","article-title":"Fixing weight decay regularization in Adam","author":"loshchilov","year":"2017"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_14"},{"key":"ref63","article-title":"ConvTransformer: A convolutional transformer network for video frame synthesis","author":"liu","year":"2020"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00309"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00561"},{"key":"ref113","article-title":"HRFormer: High-resolution transformer for dense prediction","author":"yuan","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref65","article-title":"SGDR: Stochastic gradient descent with warm restarts","author":"loshchilov","year":"2017"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00612"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref122","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","author":"zhu","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20083-0_26"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6836"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-1140-0"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00252"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10241246\/10143709.pdf?arnumber=10143709","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T17:29:51Z","timestamp":1717781391000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10143709\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10]]},"references-count":123,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2023.3282631","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10]]}}}