{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T15:32:10Z","timestamp":1775143930546,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["2018AAA0101900"],"award-info":[{"award-number":["2018AAA0101900"]}]},{"name":"MoE Engineering Research Center of Digital Library"},{"name":"NSFC","award":["U19B2042, 62072399"],"award-info":[{"award-number":["U19B2042, 62072399"]}]},{"name":"Chinese Knowledge Center for Engineering Sciences and Technology"},{"name":"Alibaba-Zhejiang University Joint Institute of Frontier Technologies"},{"name":"the Fundamental Research Funds for the Central Universities"},{"name":"Alibaba Group through Alibaba Innovative Research Program"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475561","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T05:04:15Z","timestamp":1634533455000},"page":"4239-4248","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":120,"title":["RAMS-Trans: Recurrent Attention Multi-scale Transformer for Fine-grained Image Recognition"],"prefix":"10.1145","author":[{"given":"Yunqing","family":"Hu","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Xuan","family":"Jin","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Yin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Haiwen","family":"Hong","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Jingfeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Yuan","family":"He","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Hui","family":"Xue","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E. Hinton","author":"Ba Lei Jimmy","year":"2016"},{"key":"e_1_3_2_1_2_1","volume-title":"Serge J. Belongie, and Pietro Perona.","author":"Branson Steve","year":"2014"},{"key":"e_1_3_2_1_3_1","volume-title":"End-to-End Object Detection with Transformers. CoRR","author":"Carion Nicolas","year":"2020"},{"key":"e_1_3_2_1_4_1","volume-title":"Destruction and Construction Learning for Fine-Grained Image Recognition. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019","author":"Chen Yue","year":"2019"},{"key":"e_1_3_2_1_5_1","volume-title":"Kernel Pooling for Convolutional Neural Networks. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017","author":"Cui Yin","year":"2017"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019","volume":"1","author":"Devlin Jacob","year":"2019"},{"key":"e_1_3_2_1_8_1","volume-title":"Selective Sparse Sampling for Fine-Grained Image Recognition. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019","author":"Ding Yao","year":"2019"},{"key":"e_1_3_2_1_9_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. CoRR","author":"Dosovitskiy Alexey","year":"1929"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings, Part XX (Lecture Notes in Computer Science","volume":"168","author":"Du Ruoyi","year":"2020"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/3326943.3327002"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.476"},{"key":"e_1_3_2_1_13_1","volume-title":"Compact Bilinear Pooling. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016","author":"Gao Yang","year":"2016"},{"key":"e_1_3_2_1_14_1","volume-title":"Scott","author":"Gao Yu","year":"2020"},{"key":"e_1_3_2_1_15_1","volume-title":"Weakly Supervised Complementary Parts Models for Fine-Grained Image Classification From the Bottom Up. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019","author":"Ge Weifeng","year":"2019"},{"key":"e_1_3_2_1_16_1","volume-title":"Yuille","author":"He Ju","year":"2021"},{"key":"e_1_3_2_1_17_1","volume-title":"Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016","author":"He Kaiming","year":"2016"},{"key":"e_1_3_2_1_18_1","volume-title":"The INaturalist Species Classification and Detection Dataset. In 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018","author":"Horn Grant Van","year":"2018"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00869"},{"key":"e_1_3_2_1_20_1","volume-title":"Torr","author":"Jetley Saumya","year":"2018"},{"key":"e_1_3_2_1_21_1","volume-title":"Attention Convolutional Binary Neural Tree for Fine-Grained Visual Categorization. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020","author":"Ji Ruyi","year":"2020"},{"key":"e_1_3_2_1_22_1","volume-title":"Proc. CVPR Workshop on Fine-Grained Visual Categorization (FGVC)","volume":"2","author":"Khosla Aditya","year":"2011"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.688"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.170"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6822"},{"key":"e_1_3_2_1_26_1","volume-title":"Cross-X Learning for Fine-Grained Visual Categorization. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019","author":"Luo Wei","year":"2019"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2020.3020227"},{"key":"e_1_3_2_1_28_1","volume-title":"Munich","volume":"67","author":"Recasens Adri\u00e0","year":"2018"},{"key":"e_1_3_2_1_29_1","volume-title":"3rd International Conference on Learning Representations, ICLR","author":"Simonyan Karen","year":"2015"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.5555\/3298023.3298188"},{"key":"e_1_3_2_1_31_1","volume-title":"Training data-efficient image transformers & distillation through attention. CoRR","author":"Touvron Hugo","year":"2020"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_33_1","unstructured":"Catherine Wah Steve Branson Peter Welinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011).  Catherine Wah Steve Branson Peter Welinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00436"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2688133"},{"key":"e_1_3_2_1_36_1","volume-title":"Mask-CNN: Localizing Parts and Selecting Descriptors for Fine-Grained Image Recognition. CoRR","author":"Wei Xiu-Shen","year":"2016"},{"key":"e_1_3_2_1_37_1","volume-title":"Munich","volume":"380","author":"Wei Xing","year":"2018"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454804"},{"key":"e_1_3_2_1_39_1","volume-title":"Munich","volume":"454","author":"Yang Ze","year":"2018"},{"key":"e_1_3_2_1_40_1","volume-title":"MMM 2021, Prague, Czech Republic, June 22-24, 2021, Proceedings, Part I (Lecture Notes in Computer Science","volume":"147","author":"Zhang Fan","year":"2021"},{"key":"e_1_3_2_1_41_1","volume-title":"Learning Multi-attention Convolutional Neural Network for Fine-Grained Image Recognition. In IEEE International Conference on Computer Vision, ICCV 2017","author":"Zheng Heliang","year":"2017"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454672"},{"key":"e_1_3_2_1_43_1","volume-title":"Looking for the Devil in the Details: Learning Trilinear Attention Sampling Network for Fine-Grained Image Recognition. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019","author":"Zheng Heliang","year":"2019"},{"key":"e_1_3_2_1_44_1","volume-title":"Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers. CoRR","author":"Zheng Sixiao","year":"2020"},{"key":"e_1_3_2_1_45_1","volume-title":"Learning Deep Features for Discriminative Localization. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016","author":"Zhou Bolei","year":"2016"},{"key":"e_1_3_2_1_46_1","volume-title":"Deformable DETR: Deformable Transformers for End-to-End Object Detection. CoRR","author":"Zhu Xizhou","year":"2020"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7016"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475561","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475561","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:49:10Z","timestamp":1750193350000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475561"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":47,"alternative-id":["10.1145\/3474085.3475561","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475561","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}