{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:09:32Z","timestamp":1775066972367,"version":"3.50.1"},"reference-count":115,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2022]]},"DOI":"10.1109\/tpami.2022.3211006","type":"journal-article","created":{"date-parts":[[2022,10,5]],"date-time":"2022-10-05T19:36:24Z","timestamp":1664998584000},"page":"1-13","source":"Crossref","is-referenced-by-count":366,"title":["Beyond Self-Attention: External Attention Using Two Linear Layers for Visual Tasks"],"prefix":"10.1109","author":[{"given":"Meng-Hao","family":"Guo","sequence":"first","affiliation":[{"name":"BNRist, Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"given":"Zheng-Ning","family":"Liu","sequence":"additional","affiliation":[{"name":"BNRist, Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9197-346X","authenticated-orcid":false,"given":"Tai-Jiang","family":"Mu","sequence":"additional","affiliation":[{"name":"BNRist, Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7507-6542","authenticated-orcid":false,"given":"Shi-Min","family":"Hu","sequence":"additional","affiliation":[{"name":"BNRist, Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Neural machine translation by jointly learning to align and translate","author":"Bahdanau"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00326"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00069"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00926"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1909.11065"},{"key":"ref8","article-title":"Is attention better than matrix decomposition?","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Geng"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1038\/381607a0"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2006.881199"},{"key":"ref11","first-page":"7354","article-title":"Self-attention generative adversarial networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhang"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"ref13","first-page":"2204","article-title":"Recurrent models of visual attention","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Mnih"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045336"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2913372"},{"key":"ref16","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Vaswani"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01465-9"},{"key":"ref18","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00378"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00484"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-022-0271-y"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00338"},{"key":"ref23","first-page":"68","article-title":"Stand-alone self-attention in vision models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Parmar"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01009"},{"key":"ref25","article-title":"A structured self-attentive sentence embedding","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lin"},{"key":"ref26","first-page":"5754","article-title":"XLNet: Generalized autoregressive pretraining for language understanding","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btz682"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref30","first-page":"1691","article-title":"Generative pretraining from pixels","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen"},{"key":"ref31","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","author":"Touvron","year":"2021","journal-title":"Proc. Int. Conf. Mach. Learn."},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref34","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2021","journal-title":"Proc. Int. Conf. Learn. Representations"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01212"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"ref39","first-page":"14745","article-title":"Transgan: Two pure transformers can make one strong gan, and that can scale up","volume":"34","author":"Jiang","year":"2021","journal-title":"Proc. Adv. Neural Inform. Process. Syst."},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"ref42","article-title":"CPTR: Full transformer network for image captioning","author":"Liu","year":"2021"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-021-0229-5"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3152247"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3505244"},{"key":"ref47","first-page":"249","article-title":"Understanding the difficulty of training deep feedforward neural networks","volume-title":"Proc. 13th Int. Conf. Artif. Intell. Statist.","author":"Glorot"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2572683"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-022-0274-8"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"ref55","first-page":"15908","article-title":"Transformer in transformer","volume":"34","author":"Han","year":"2021","journal-title":"Proc. Adv. Neural Inform. Process. Syst."},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00983"},{"key":"ref58","first-page":"9355","article-title":"Twins: Revisiting the design of spatial attention in vision transformers","volume":"34","author":"Chu","year":"2021","journal-title":"Proc. Adv. Neural Inform. Process. Syst."},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00978"},{"key":"ref60","article-title":"Generalized focal loss: Learning qualified and distributed bounding boxes for dense object detection","volume":"33","author":"Li","year":"2020","journal-title":"Proc. Adv. Neural Inform. Process. Syst."},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"ref62","article-title":"Mmdetection: Open mmlab detection toolbox and benchmark","author":"Chen","year":"2019"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2844175"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2858826"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2956516"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.660"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00199"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00747"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01308"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00064"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00897"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_17"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00068"},{"key":"ref75","article-title":"MMSegmentation: Openmmlab semantic segmentation toolbox and benchmark","author":"Contributors","year":"2020"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00770"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00366"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00246"},{"key":"ref79","article-title":"Unsupervised representation learning with deep convolutional generative adversarial networks","author":"Radford"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.304"},{"key":"ref81","article-title":"Improving the improved training of wasserstein GANs","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wei"},{"key":"ref82","article-title":"cGANs with projection discriminator","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Miyato"},{"key":"ref83","first-page":"77","article-title":"PointNet: Deep learning on point sets for 3D classification and segmentation","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Qi"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.99"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00979"},{"key":"ref86","first-page":"5099","article-title":"PointNet++: Deep hierarchical feature learning on point sets in a metric space","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Qi"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201301"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1145\/3326362"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018778"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00985"},{"key":"ref91","first-page":"828","article-title":"PointCNN: Convolution on X-transformed points","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00563"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00910"},{"key":"ref94","first-page":"350","article-title":"A${}^{{2}}$2-Nets: Double attention networks","author":"Chen","year":"2018","journal-title":"Proc. Adv. Neural Inform. Process. Syst."},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00366"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00064"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00959"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00571"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00760"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00651"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-020-3097-4"},{"key":"ref102","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Paszke"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref105","article-title":"Rethinking attention with performers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Choromanski"},{"key":"ref106","article-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"ref107","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ioffe"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-1140-0"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref111","first-page":"21357","article-title":"ContraGAN: Contrastive learning for conditional image generation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Kang"},{"key":"ref112","first-page":"6626","article-title":"GANs trained by a two time-scale update rule converge to a local nash equilibrium","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Heusel"},{"key":"ref113","first-page":"2226","article-title":"Improved techniques for training GANs","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Salimans"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298801"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1145\/2980179.2980238"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/4359286\/09912362.pdf?arnumber=9912362","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,22]],"date-time":"2024-01-22T21:56:53Z","timestamp":1705960613000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9912362\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"references-count":115,"URL":"https:\/\/doi.org\/10.1109\/tpami.2022.3211006","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]}}}