{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T04:08:15Z","timestamp":1775275695699,"version":"3.50.1"},"reference-count":129,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&amp;D Program of China","award":["2022ZD0161000"],"award-info":[{"award-number":["2022ZD0161000"]}]},{"name":"General Research Fund of Hong Kong","award":["17200622"],"award-info":[{"award-number":["17200622"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1109\/tpami.2023.3303397","type":"journal-article","created":{"date-parts":[[2023,8,8]],"date-time":"2023-08-08T17:31:54Z","timestamp":1691515914000},"page":"14284-14300","source":"Crossref","is-referenced-by-count":98,"title":["CycleMLP: A MLP-Like Architecture for Dense Visual Predictions"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6126-2595","authenticated-orcid":false,"given":"Shoufa","family":"Chen","sequence":"first","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6890-1049","authenticated-orcid":false,"given":"Enze","family":"Xie","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1142-9171","authenticated-orcid":false,"given":"Chongjian","family":"Ge","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0519-496X","authenticated-orcid":false,"given":"Runjian","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9774-4687","authenticated-orcid":false,"given":"Ding","family":"Liang","sequence":"additional","affiliation":[{"name":"SenseTime Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6685-7950","authenticated-orcid":false,"given":"Ping","family":"Luo","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong"}]}],"member":"263","reference":[{"key":"ref57","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref59","article-title":"Are we ready for a new paradigm shift? A survey on visual deep MLP","author":"liu","year":"2021"},{"key":"ref58","first-page":"9204","article-title":"Pay attention to MLPs","author":"liu","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref53","article-title":"AS-MLP: An axial shifted MLP architecture for vision","author":"lian","year":"2022","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01112"},{"key":"ref55","article-title":"Network in network","author":"lin","year":"2013","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref51","article-title":"GraphMLP: A graph MLP-like architecture for 3d human pose estimation","author":"li","year":"2022"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00198"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00656"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3007032"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1989.1.4.541"},{"key":"ref47","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2918284"},{"key":"ref41","article-title":"MobileNets: Efficient convolutional neural networks for mobile vision applications","author":"howard","year":"2017"},{"key":"ref44","first-page":"646","article-title":"Deep networks with stochastic depth","author":"huang","year":"2016","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01206"},{"key":"ref8","article-title":"Rethinking atrous convolution for semantic image segmentation","author":"chen","year":"2017"},{"key":"ref7","article-title":"MMDetection: Open MMLab detection toolbox and benchmark","author":"chen","year":"2019"},{"key":"ref9","first-page":"801","article-title":"Encoder-decoder with atrous separable convolution for semantic image segmentation","author":"chen","year":"2018","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref4","first-page":"1877","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref3","first-page":"813","article-title":"Is space-time attention all you need for video understanding?","author":"bertasius","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref6","article-title":"MixerGAN: An MLP-based architecture for unpaired image-to-image translation","author":"cazenavette","year":"2021"},{"key":"ref5","first-page":"213","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"ref101","first-page":"22691","article-title":"DynaMixer: A vision MLP architecture with dynamic mixing","author":"wang","year":"2022","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3145427"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.123"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref37","article-title":"Benchmarking neural network robustness to common corruptions and perturbations","author":"hendrycks","year":"2019","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00090"},{"key":"ref30","first-page":"249","article-title":"Understanding the difficulty of training deep feedforward neural networks","author":"glorot","year":"2010","journal-title":"Proc 13th Int Conf Artif Intell Statist"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref32","article-title":"Beyond self-attention: External attention using two linear layers for visual tasks","author":"guo","year":"2021"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/0893-6080(89)90020-8"},{"key":"ref38","article-title":"Gaussian error linear units (GELUs)","author":"hendrycks","year":"2016"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00374"},{"key":"ref23","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00326"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref20","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref22","first-page":"2793","article-title":"Attention is not all you need: Pure attention loses rank doubly exponentially with depth","author":"dong","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00066"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00685"},{"key":"ref29","article-title":"PyTorch library for cam methods","year":"2021"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.2307\/2003354"},{"key":"ref12","article-title":"OpenMMLab pose estimation toolbox and benchmark","year":"2020"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"},{"key":"ref14","article-title":"On the relationship between self-attention and convolutional layers","author":"cordonnier","year":"2020","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref129","article-title":"Deformable {DETR}: Deformable transformers for end-to-end object detection","author":"zhu","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref97","article-title":"CrowdMLP: Weakly-supervised crowd counting via multi-granularity MLP","author":"wang","year":"2022"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2983686"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7000"},{"key":"ref11","article-title":"MMSegmentation: OpenMMLab semantic segmentation toolbox and benchmark","year":"2020"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref124","article-title":"Mixup: Beyond empirical risk minimization","author":"zhang","year":"2018","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref10","article-title":"PointMixer: MLP-mixer for point cloud understanding","author":"choe","year":"2021"},{"key":"ref98","article-title":"PVTv2: Improved baselines with pyramid vision transformer","author":"wang","year":"2021"},{"key":"ref125","article-title":"Mixing and shifting: Exploiting global and local dependencies in vision MLPs","author":"zheng","year":"2022"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref18","first-page":"7480","article-title":"Scaling vision transformers to 22 billion parameters","author":"dehghani","year":"2023","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00568"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"ref95","article-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding","author":"wang","year":"2019","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref94","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref91","article-title":"Training data-efficient image transformers & distillation through attention","author":"touvron","year":"2020"},{"key":"ref90","article-title":"ResMLP: Feedforward networks for image classification with data-efficient training","author":"touvron","year":"2021"},{"key":"ref89","first-page":"24261","article-title":"MLP-mixer: An all-MLP architecture for vision","author":"tolstikhin","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20133"},{"key":"ref85","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","author":"tan","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref88","first-page":"3172","article-title":"RaftMLP: How much can be done without attention and with less spatial locality?","author":"tatsunami","year":"2022","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01066"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00584"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00584"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548025"},{"key":"ref108","first-page":"12077","article-title":"SegFormer: Simple and efficient design for semantic segmentation with transformers","author":"xie","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref106","article-title":"Skating-mixer: Multimodal MLP for scoring figure skating","author":"xia","year":"2022"},{"key":"ref107","first-page":"418","article-title":"Unified perceptual parsing for scene understanding","author":"xiao","year":"2018","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref75","first-page":"318","article-title":"Learning internal representations by error propagation","author":"rumelhart","year":"1986","journal-title":"Parallel Distributed Processing Explorations in the Microstructure of Cognition Vol 1 Foundations"},{"key":"ref104","article-title":"PyTorch image models","author":"wightman","year":"2019"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1037\/h0042519"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"ref77","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"Proc 3rd Int Conf Learn Representations"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897675"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01228-7"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25376"},{"key":"ref2","article-title":"Layer normalization","author":"ba","year":"2016"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref71","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01159"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00307"},{"key":"ref112","article-title":"XLNet: Generalized autoregressive pretraining for language understanding","author":"yang","year":"2019","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref73","first-page":"980","article-title":"Global filter networks for image classification","author":"rao","year":"2021","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"ref110","article-title":"Speech-MLP: A simple MLP architecture for speech processing","author":"xing","year":"2022"},{"key":"ref68","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref119","first-page":"173","article-title":"Object-contextual representations for semantic segmentation","author":"yuan","year":"2020","journal-title":"Proc 16th Eur Conf Comput Vis"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2022.3191974"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00367"},{"key":"ref69","first-page":"31","article-title":"SepMLP: An all-MLP architecture for music source separation","author":"qian","year":"2022","journal-title":"Proc 9th Conf Sound Music Technol Revised Sel Papers CMST"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"ref64","article-title":"Image-to-image MLP-mixer for image reconstruction","author":"mansour","year":"2022"},{"key":"ref115","article-title":"Multi-scale context aggregation by dilated convolutions","author":"yu","year":"2016","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref63","article-title":"Rethinking network design and local geometry in point cloud: A simple residual MLP framework","author":"ma","year":"2022","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref116","article-title":"S$^{2}$2-MLPv2: Improved spatial-shift MLP architecture for vision","author":"yu","year":"2021"},{"key":"ref66","article-title":"MLP architectures for vision-and-language modeling: An empirical study","author":"nie","year":"2021"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_12"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01173"},{"key":"ref114","article-title":"Multi-scale context aggregation by dilated convolutions","author":"yu","year":"2016","journal-title":"Proc 4th Int Conf Learn Representations"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref122","article-title":"Dynamic MLP for MRI reconstruction","author":"zhang","year":"2023"},{"key":"ref123","article-title":"MorphMLP: A self-attention free, MLP-like backbone for image and video","author":"zhang","year":"2021"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/BIBM55620.2022.9995348"},{"key":"ref120","first-page":"7281","article-title":"Hrformer: High-resolution transformer for dense prediction","volume":"34","author":"yuan","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref61","article-title":"Decoupled weight decay regularization","author":"loshchilov","year":"2019","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00612"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10308548\/10210694.pdf?arnumber=10210694","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,27]],"date-time":"2023-11-27T19:50:27Z","timestamp":1701114627000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10210694\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12]]},"references-count":129,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2023.3303397","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12]]}}}