{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T19:18:23Z","timestamp":1778786303176,"version":"3.51.4"},"reference-count":73,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,10]]},"DOI":"10.1109\/iccv48922.2021.01205","type":"proceedings-article","created":{"date-parts":[[2022,2,28]],"date-time":"2022-02-28T22:08:02Z","timestamp":1646086082000},"page":"12250-12260","source":"Crossref","is-referenced-by-count":268,"title":["AutoFormer: Searching Transformers for Visual Recognition"],"prefix":"10.1109","author":[{"given":"Minghao","family":"Chen","sequence":"first","affiliation":[{"name":"Stony Brook University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Houwen","family":"Peng","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianlong","family":"Fu","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haibin","family":"Ling","sequence":"additional","affiliation":[{"name":"Stony Brook University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00907"},{"key":"ref72","article-title":"Neural architecture search with reinforcement learning","author":"zoph","year":"2016","journal-title":"ICLRE"},{"key":"ref71","article-title":"Go wide, then narrow: Efficient training of deep thin networks","author":"zhou","year":"2020","journal-title":"ICML"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01009"},{"key":"ref39","article-title":"Cream of the crop: Distilling prioritized paths for one-shot neural architecture search","author":"peng","year":"2020","journal-title":"NeurIPS"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"ref33","article-title":"Visualizing the loss landscape of neural nets","author":"li","year":"2017","journal-title":"NeurIPS"},{"key":"ref32","article-title":"Boss-nas: Exploring hybrid cnn-transformers with block-wisely self-supervised neural architecture search","author":"li","year":"2021"},{"key":"ref31","article-title":"Blockwisely supervised neural architecture search with knowledge distillation","author":"li","year":"2020","journal-title":"CVPR"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00206"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"ref36","article-title":"Are sixteen heads really better than one?","author":"michel","year":"2019","journal-title":"NeurIPS"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00915"},{"key":"ref34","article-title":"Random search and reproducibility for neural architecture search","author":"li","year":"2019","journal-title":"UAI"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.154"},{"key":"ref63","article-title":"Co-scale convattentional image transformers","author":"xu","year":"2021"},{"key":"ref28","article-title":"Albert: A lite bert for self-supervised learning of language representations","author":"lan","year":"2020","journal-title":"ICLRE"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00189"},{"key":"ref27","article-title":"Learning multiple layers of features from tiny images","author":"krizhevsky","year":"2009"},{"key":"ref65","article-title":"Bignas: Scaling up neural architecture search with big single-stage models","author":"yu","year":"2020","journal-title":"NeurIPS"},{"key":"ref66","article-title":"Slimmable neural networks","author":"yu","year":"2019","journal-title":"ICLRE"},{"key":"ref29","article-title":"Deeply-supervised nets","author":"lee","year":"2015","journal-title":"AIS-TATS"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00612"},{"key":"ref69","article-title":"mixup: Beyond empirical risk minimization","author":"zhang","year":"2018","journal-title":"ICLRE"},{"key":"ref2","article-title":"Layer normalization","author":"ba","year":"2016","journal-title":"NeurIPS"},{"key":"ref1","article-title":"A convergence theory for deep learning via over-parameterization","author":"allen-zhu","year":"2019","journal-title":"ICML"},{"key":"ref20","article-title":"Augment your batch: better training with larger batches","author":"hoffer","year":"2020","journal-title":"CVPR"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"ref24","article-title":"Ccnet: Crisscross attention for semantic segmentation","author":"huang","year":"2019","journal-title":"ICCV"},{"key":"ref23","article-title":"Deep networks with stochastic depth","author":"huang","year":"2016","journal-title":"ECCV"},{"key":"ref26","article-title":"Collecting a large-scale dataset of fine-grained cars","author":"krause","year":"2013"},{"key":"ref25","article-title":"Transformers in vision: A survey","author":"khan","year":"2021"},{"key":"ref50","article-title":"Training data-efficient image transformers & distillation through attention","author":"touvron","year":"2021","journal-title":"ICML"},{"key":"ref51","article-title":"Grafit: Learning fine-grained image representations with coarse labels","author":"touvron","year":"2020"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01099"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1176"},{"key":"ref56","article-title":"Axial-deeplab: Stand-alone axial-attention for panoptic segmentation","author":"wang","year":"2020","journal-title":"ECCV"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.686"},{"key":"ref54","article-title":"Regularization of neural networks using drop-connect","author":"wan","year":"2013","journal-title":"ICML"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1580"},{"key":"ref52","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref40","article-title":"Efficient neural architecture search via parameters sharing","author":"pham","year":"2018","journal-title":"ICML"},{"key":"ref12","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"NAACL"},{"key":"ref13","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"ICLRE"},{"key":"ref14","article-title":"Gradient descent provably optimizes over-parameterized neural networks","author":"du","year":"2018","journal-title":"ICLRE"},{"key":"ref15","article-title":"Neural architecture search: A survey","author":"elsken","year":"2019","journal-title":"JMLR"},{"key":"ref16","article-title":"Single path one-shot neural architecture search with uniform sampling","author":"guo","year":"2020","journal-title":"ECCV"},{"key":"ref17","article-title":"A survey on visual transformer","author":"han","year":"2020"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref19","article-title":"Gaussian error linear units (gelus)","author":"hendrycks","year":"2016"},{"key":"ref4","article-title":"Understanding and simplifying one-shot architecture search","author":"bender","year":"2018","journal-title":"ICML"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1338"},{"key":"ref6","article-title":"Once for all: Train one network and specialize it for efficient deployment","author":"cai","year":"2020","journal-title":"ICLRE"},{"key":"ref5","article-title":"Smash: one-shot model architecture search through hypernetworks","author":"brock","year":"2018","journal-title":"ICLRE"},{"key":"ref8","article-title":"Fair-nas: Rethinking evaluation fairness of weight sharing neural architecture search","author":"chu","year":"2019"},{"key":"ref7","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"ECCV"},{"key":"ref49","article-title":"Efficientnet: Rethinking model scaling for convolutional neural networks","author":"tan","year":"2019","journal-title":"ICML"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4828"},{"key":"ref46","article-title":"Bottle-neck transformers for visual recognition","author":"srinivas","year":"2021"},{"key":"ref45","article-title":"The evolved transformer","author":"so","year":"2019","journal-title":"International Conference on Machine Learning"},{"key":"ref48","article-title":"Mnas-net: Platform-aware neural architecture search for mobile","author":"tan","year":"2019","journal-title":"CVPR"},{"key":"ref47","article-title":"Dropout: a simple way to prevent neural networks from overfitting","author":"srivastava","year":"2014","journal-title":"JMLR"},{"key":"ref42","article-title":"Stand-alone self-attention in vision models","author":"ramachandran","year":"2019"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33014780"}],"event":{"name":"2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Montreal, QC, Canada","start":{"date-parts":[[2021,10,10]]},"end":{"date-parts":[[2021,10,17]]}},"container-title":["2021 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9709627\/9709628\/09711132.pdf?arnumber=9711132","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,14]],"date-time":"2022-07-14T19:45:22Z","timestamp":1657827922000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9711132\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10]]},"references-count":73,"URL":"https:\/\/doi.org\/10.1109\/iccv48922.2021.01205","relation":{},"subject":[],"published":{"date-parts":[[2021,10]]}}}