{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T22:00:31Z","timestamp":1781820031293,"version":"3.54.5"},"reference-count":114,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21A20471"],"award-info":[{"award-number":["U21A20471"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1911401"],"award-info":[{"award-number":["U1911401"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1811461"],"award-info":[{"award-number":["U1811461"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangdong NSF","award":["2020B1515120085"],"award-info":[{"award-number":["2020B1515120085"]}]},{"name":"Guangdong NSF","award":["2018B030312002"],"award-info":[{"award-number":["2018B030312002"]}]},{"DOI":"10.13039\/100018919","name":"Peng Cheng Laboratory","doi-asserted-by":"publisher","award":["PCL2021A07"],"award-info":[{"award-number":["PCL2021A07"]}],"id":[{"id":"10.13039\/100018919","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangzhou Research","award":["201902010037"],"award-info":[{"award-number":["201902010037"]}]},{"name":"Key-Area Research and Development Program of Guangzhou","award":["202007030004"],"award-info":[{"award-number":["202007030004"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2023]]},"DOI":"10.1109\/tmm.2023.3243616","type":"journal-article","created":{"date-parts":[[2023,2,9]],"date-time":"2023-02-09T18:32:29Z","timestamp":1675967549000},"page":"8906-8919","source":"Crossref","is-referenced-by-count":349,"title":["DilateFormer: Multi-Scale Dilated Transformer for Visual Recognition"],"prefix":"10.1109","volume":"25","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0507-2620","authenticated-orcid":false,"given":"Jiayu","family":"Jiao","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5472-0079","authenticated-orcid":false,"given":"Yu-Ming","family":"Tang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0013-3730","authenticated-orcid":false,"given":"Kun-Yu","family":"Lin","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yipeng","family":"Gao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0165-8416","authenticated-orcid":false,"given":"Andy J.","family":"Ma","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2197-9038","authenticated-orcid":false,"given":"Yaowei","family":"Wang","sequence":"additional","affiliation":[{"name":"Pengcheng Laboratory, ShenZhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8327-0003","authenticated-orcid":false,"given":"Wei-Shi","family":"Zheng","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref2","first-page":"1","article-title":"Very deep convolutional networks for large-scale image recognition","volume-title":"Proc. 3rd Int. Conf. Learn. Representations","author":"Simonyan","year":"2015"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.90"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2015.7298594"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01167"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2022.3146775"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2020.3002614"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2577031"},{"key":"ref9","article-title":"YOLOv3: An incremental improvement","author":"Redmon","year":"2018","journal-title":"arXiv:1804.02767"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2017.324"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2017.322"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref14","article-title":"Rethinking Atrous convolution for semantic image segmentation","author":"Chen","year":"2017","journal-title":"arXiv:1706.05587"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2015.7298965"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref17","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Brown","year":"2020"},{"key":"ref18","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"ref19","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Vaswani","year":"2017"},{"key":"ref20","article-title":"Longformer: The long-document transformer","author":"Beltagy","year":"2020","journal-title":"arXiv:2004.05150"},{"key":"ref21","first-page":"1","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. 9th Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref22","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Touvron","year":"2021"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00010"},{"key":"ref24","article-title":"Conditional positional encodings for vision transformers","author":"Chu","year":"2021","journal-title":"arXiv:2102.10882"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3068576"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00986"},{"key":"ref27","article-title":"Focal self-attention for local-global interactions in vision transformers","volume-title":"arXiv:2107.00641","author":"Yang","year":"2021"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00299"},{"key":"ref29","first-page":"9355","article-title":"Twins: Revisiting the design of spatial attention in vision transformers","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Chu","year":"2021"},{"key":"ref30","article-title":"Neighborhood attention transformer","author":"Hassani","year":"2022","journal-title":"arXiv:2204.07143"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00061"},{"key":"ref32","first-page":"18590","article-title":"All tokens matter: Token labeling for training better vision transformers","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Jiang","year":"2021"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2009.5206848"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.544"},{"key":"ref36","first-page":"12992","article-title":"Glance-and-gaze vision transformer","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yu","year":"2021"},{"key":"ref37","article-title":"Shuffle transformer: Rethinking spatial shuffle for vision transformer","author":"Huang","year":"2021","journal-title":"arXiv:2106.03650"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_27"},{"key":"ref39","article-title":"CrossFormer: A versatile vision transformer based on cross-scale attention","author":"Wang","year":"2021","journal-title":"arXiv:2108.00154"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.00520"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00042"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01058"},{"key":"ref43","first-page":"28522","article-title":"Vitae: Vision transformer advanced by exploring intrinsic inductive bias","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Xu","year":"2021"},{"key":"ref44","article-title":"UniFormer: Unifying convolution and self-attention for visual recognition","author":"Li","year":"2022","journal-title":"arXiv:2201.09450"},{"key":"ref45","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics","author":"Devlin","year":"2019"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01179"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4899-7502-7_79-1"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2019.00612"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.308"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00815"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7000"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00988"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00044"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2022.3206108"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20252"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3109665"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3096083"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3074008"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2022.3164083"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01181"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00360"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00729"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2020.2997192"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.00714"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00707"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/iccvw54120.2021.00301"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2020.2991592"},{"key":"ref69","first-page":"90","article-title":"ResT-ReID: Transformer block-based residual learning for person re-identification","volume-title":"Pattern Recognit. Lett.","volume":"157","author":"Chen","year":"2022"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.01474"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1155\/2022\/8917964"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3050082"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/icme52920.2022.9859907"},{"key":"ref74","first-page":"30392","article-title":"Early convolutions help transformers see better","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Xiao","year":"2021"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01186"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00062"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3086758"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01553"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.00943"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3072479"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3090274"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3057493"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3090206"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3100766"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01055"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"ref89","first-page":"1","article-title":"Multi-scale context aggregation by dilated convolutions","volume-title":"Proc. 4th Int. Conf. Learn. Representations","author":"Yu","year":"2016"},{"key":"ref90","first-page":"2990","article-title":"Group equivariant convolutional networks","volume-title":"Proc. 33nd Int. Conf. Mach. Learn.","author":"Cohen","year":"2016"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3050059"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3086709"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01104"},{"key":"ref94","article-title":"Dilated neighborhood attention transformer","author":"Hassani","year":"2022","journal-title":"arXiv:2209.15001"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"ref96","first-page":"1","article-title":"On the connection between local attention and dynamic depth-wise convolution","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Han","year":"2022"},{"key":"ref97","first-page":"3965","article-title":"Coatnet: Marrying convolution and attention for all data sizes","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Dai","year":"2021"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"ref99","first-page":"15908","article-title":"Transformer in transformer","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Han","year":"2021"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20252"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"ref102","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tan","year":"2019"},{"key":"ref103","article-title":"Next-vit: Next generation vision transformer for efficient deployment in realistic industrial scenarios","author":"Li","year":"2022","journal-title":"arXiv:2207.05501"},{"key":"ref104","article-title":"DeepViT: Towards deeper vision transformer","author":"Zhou","year":"2021","journal-title":"arXiv:2103.11886"},{"key":"ref105","first-page":"1","article-title":"Decoupled weight decay regularization","volume-title":"Proc. 7th Int. Conf. Learn. Representations","author":"Loshchilov","year":"2019"},{"key":"ref106","article-title":"Trt-vit: TensorRT-oriented vision transformer","author":"Xia","year":"2022","journal-title":"arXiv:2205.09579"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.634"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2019.2956516"},{"key":"ref109","article-title":"MMDetection: Open MMLab detection toolbox and benchmark","author":"Chen","year":"2019","journal-title":"arXiv:1906.07155"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00656"},{"key":"ref112","article-title":"MMSegmentation: OpenMMLab semantic segmentation toolbox and benchmark","author":"Contributors","year":"2020"},{"key":"ref113","first-page":"20014","article-title":"XCiT: Cross-covariance image transformers","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Ali","year":"2021"},{"key":"ref114","article-title":"Pytorch library for cam methods","author":"Gildenblat","year":"2021"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/10016790\/10041780.pdf?arnumber=10041780","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,12]],"date-time":"2024-01-12T00:57:44Z","timestamp":1705021064000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10041780\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":114,"URL":"https:\/\/doi.org\/10.1109\/tmm.2023.3243616","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]}}}