{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T20:46:26Z","timestamp":1763239586464,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61976049, 62072080 and U20B2063"],"award-info":[{"award-number":["61976049, 62072080 and U20B2063"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Sichuan Science and Technology Program","award":["2019ZDZX0008"],"award-info":[{"award-number":["2019ZDZX0008"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548383","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:43:12Z","timestamp":1665416592000},"page":"3811-3819","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["MAVT-FG"],"prefix":"10.1145","author":[{"given":"Xiaoyu","family":"Zhou","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaotong","family":"Song","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingran","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xing","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00407"},{"key":"e_1_3_2_2_2_1","volume-title":"Soundnet: Learning sound representations from unlabeled video. Advances in neural information processing systems","author":"Aytar Yusuf","year":"2016","unstructured":"Yusuf Aytar , Carl Vondrick , and Antonio Torralba . 2016 . Soundnet: Learning sound representations from unlabeled video. Advances in neural information processing systems , Vol. 29 (2016). Yusuf Aytar, Carl Vondrick, and Antonio Torralba. 2016. Soundnet: Learning sound representations from unlabeled video. Advances in neural information processing systems, Vol. 29 (2016)."},{"key":"e_1_3_2_2_3_1","unstructured":"birder.cn. 2017. birder. http:\/\/www.birder.cn\/video.html.  birder.cn. 2017. birder. http:\/\/www.birder.cn\/video.html."},{"key":"e_1_3_2_2_4_1","unstructured":"Birdsdata.com. 2020. Birdsdata. https:\/\/open.baai.ac.cn\/data-set-detail\/.  Birdsdata.com. 2020. Birdsdata. https:\/\/open.baai.ac.cn\/data-set-detail\/."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2339530.2339616"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2020.2973812"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00694"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413869"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_10"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6712"},{"key":"e_1_3_2_2_11_1","volume-title":"Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong , Yu-An Chung , and James Glass . 2021 . Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021). Yuan Gong, Yu-An Chung, and James Glass. 2021. Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475241"},{"key":"e_1_3_2_2_13_1","volume-title":"TransFG: A Transformer Architecture for Fine-grained Recognition. arXiv","author":"He Ju","year":"2021","unstructured":"Ju He and Adam Kortylewski Cheng Yang Yutong Bai Changhu Wang Alan Yuille Jieneng Chen , Shuai Liu . 2021. TransFG: A Transformer Architecture for Fine-grained Recognition. arXiv ( 2021 ). Ju He and Adam Kortylewski Cheng Yang Yutong Bai Changhu Wang Alan Yuille Jieneng Chen, Shuai Liu. 2021. TransFG: A Transformer Architecture for Fine-grained Recognition. arXiv (2021)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350974"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cub.2005.08.058"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475561"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.3390\/s20010183"},{"key":"e_1_3_2_2_18_1","volume-title":"Proceedings of the Asian Conference on Computer Vision.","author":"Lin Yan-Bo","year":"2020","unstructured":"Yan-Bo Lin and Yu-Chiang Frank Wang . 2020 . Audiovisual transformer with instance attention for audio-visual event localization . In Proceedings of the Asian Conference on Computer Vision. Yan-Bo Lin and Yu-Chiang Frank Wang. 2020. Audiovisual transformer with instance attention for audio-visual event localization. In Proceedings of the Asian Conference on Computer Vision."},{"key":"e_1_3_2_2_19_1","volume-title":"Transformer with peak suppression and knowledge guidance for fine-grained image recognition. arXiv preprint arXiv:2107.06538","author":"Liu Xinda","year":"2021","unstructured":"Xinda Liu , Lili Wang , and Xiaoguang Han . 2021. Transformer with peak suppression and knowledge guidance for fine-grained image recognition. arXiv preprint arXiv:2107.06538 ( 2021 ). Xinda Liu, Lili Wang, and Xiaoguang Han. 2021. Transformer with peak suppression and knowledge guidance for fine-grained image recognition. arXiv preprint arXiv:2107.06538 (2021)."},{"key":"e_1_3_2_2_20_1","unstructured":"ManyBirds and Malcolm Mark Swan. 2006. Manybirds. https:\/\/www.manybirds.com\/.  ManyBirds and Malcolm Mark Swan. 2006. Manybirds. https:\/\/www.manybirds.com\/."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01463"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"e_1_3_2_2_23_1","unstructured":"Cornell Lab of Ornithology. 2022. allaboutbirds. https:\/\/www.aboutbirds.org\/.  Cornell Lab of Ornithology. 2022. allaboutbirds. https:\/\/www.aboutbirds.org\/."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00713"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414053"},{"key":"e_1_3_2_2_26_1","volume-title":"The Pytorch-kaldi Speech Recognition Toolkit. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 6465--6469","author":"Ravanelli Mirco","year":"2019","unstructured":"Mirco Ravanelli , Titouan Parcollet , and Yoshua Bengio . 2019 . The Pytorch-kaldi Speech Recognition Toolkit. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 6465--6469 . https:\/\/doi.org\/10.1109\/ICASSP.2019.8683713 Mirco Ravanelli, Titouan Parcollet, and Yoshua Bengio. 2019. The Pytorch-kaldi Speech Recognition Toolkit. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 6465--6469. https:\/\/doi.org\/10.1109\/ICASSP.2019.8683713"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICONAT53423.2022.9725906"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413638"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Z. Sun Y. Yao X. S. Wei Y. Zhang F. Shen J. Wu J. Zhang and H. T. Shen. 2021. Webly Supervised Fine-Grained Recognition: Benchmark Datasets and An Approach. (2021).  Z. Sun Y. Yao X. S. Wei Y. Zhang F. Shen J. Wu J. Zhang and H. T. Shen. 2021. Webly Supervised Fine-Grained Recognition: Benchmark Datasets and An Approach. (2021).","DOI":"10.1109\/ICCV48922.2021.01043"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475587"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298658"},{"key":"e_1_3_2_2_32_1","volume-title":"arXiv","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N Gomez , Lukasz Kaiser , and Illia Polosukhin . 2017. Attention Is All You Need. arXiv ( 2017 ). Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. arXiv (2017)."},{"key":"e_1_3_2_2_33_1","unstructured":"Catherine Wah Steve Branson Peter Welinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011).  Catherine Wah Steve Branson Peter Welinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475216"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413871"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413783"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.3009004"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3045530"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2967597"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413851"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16447"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00397"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019259"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01179"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7016"}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Lisboa Portugal","acronym":"MM '22"},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548383","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548383","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:44Z","timestamp":1750186844000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548383"}},"subtitle":["Multimodal Audio-Visual Transformer for Weakly-supervised Fine-Grained Recognition"],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":46,"alternative-id":["10.1145\/3503161.3548383","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548383","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}