{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T17:54:35Z","timestamp":1776880475120,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Major Technological Innovation Project of Hangzhou","award":["2022AIZD0147"],"award-info":[{"award-number":["2022AIZD0147"]}]},{"name":"Major Scientific Research Project of Zhejiang Lab","award":["2020ND8AD01"],"award-info":[{"award-number":["2020ND8AD01"]}]},{"name":"National Key Research and Development Project","award":["2022YFC2504605"],"award-info":[{"award-number":["2022YFC2504605"]}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LZ22F020012"],"award-info":[{"award-number":["LZ22F020012"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612159","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"4768-4777","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["HSVLT: Hierarchical Scale-Aware Vision-Language Transformer for Multi-Label Image Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4507-4153","authenticated-orcid":false,"given":"Shuyi","family":"Ouyang","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2336-1496","authenticated-orcid":false,"given":"Hongyi","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0171-5158","authenticated-orcid":false,"given":"Ziwei","family":"Niu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3969-3007","authenticated-orcid":false,"given":"Zhenjia","family":"Bai","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5106-5512","authenticated-orcid":false,"given":"Shiao","family":"Xie","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1217-0448","authenticated-orcid":false,"given":"Yingying","family":"Xu","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8167-5354","authenticated-orcid":false,"given":"Ruofeng","family":"Tong","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5952-0188","authenticated-orcid":false,"given":"Yen-Wei","family":"Chen","sequence":"additional","affiliation":[{"name":"Ritsumeikan University, Kusatsu, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4098-588X","authenticated-orcid":false,"given":"Lanfen","family":"Lin","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"A neural probabilistic language model. Advances in neural information processing systems","author":"Bengio Yoshua","year":"2000","unstructured":"Yoshua Bengio, R\u00e9jean Ducharme, and Pascal Vincent. 2000. A neural probabilistic language model. Advances in neural information processing systems, Vol. 13 (2000)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","volume-title":"Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs","author":"Chen Liang-Chieh","year":"2017","unstructured":"Liang-Chieh Chen, George Papandreou, Iasonas Kokkinos, Kevin Murphy, and Alan L Yuille. 2017. Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE transactions on pattern analysis and machine intelligence, Vol. 40, 4 (2017), 834--848."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12281"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00061"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00532"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9860016"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"e_1_3_2_1_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_11_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"Christopher KI Williams, John Winn, and Andrew Zisserman.","author":"Everingham Mark","year":"2010","unstructured":"Mark Everingham, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. 2010. The pascal visual object classes (voc) challenge. International journal of computer vision, Vol. 88 (2010), 303--338."},{"key":"e_1_3_2_1_13_1","volume-title":"Multi-label image recognition with multi-class attentional regions. arXiv e-prints","author":"Gao Bin-Bin","year":"2020","unstructured":"Bin-Bin Gao and Hong-Yu Zhou. 2020. Multi-label image recognition with multi-class attentional regions. arXiv e-prints (2020), arXiv-2007."},{"key":"e_1_3_2_1_14_1","volume-title":"Is attention better than matrix decomposition? arXiv preprint arXiv:2109.04553","author":"Geng Zhengyang","year":"2021","unstructured":"Zhengyang Geng, Meng-Hao Guo, Hongxu Chen, Xia Li, Ke Wei, and Zhouchen Lin. 2021. Is attention better than matrix decomposition? arXiv preprint arXiv:2109.04553 (2021)."},{"key":"e_1_3_2_1_15_1","volume-title":"Deep convolutional ranking for multilabel image annotation. arXiv preprint arXiv:1312.4894","author":"Gong Yunchao","year":"2013","unstructured":"Yunchao Gong, Yangqing Jia, Thomas Leung, Alexander Toshev, and Sergey Ioffe. 2013. Deep convolutional ranking for multilabel image annotation. arXiv preprint arXiv:1312.4894 (2013)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01178"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11770"},{"key":"e_1_3_2_1_19_1","volume-title":"CNN-RNN: A Unified Framework for Multi-label Image Classification. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Jiang W.","unstructured":"W. Jiang, Y. Yi, J. Mao, Z. Huang, and X. Wei. 2016. CNN-RNN: A Unified Framework for Multi-label Image Classification. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01621"},{"key":"e_1_3_2_1_21_1","volume-title":"Chen Change Loy, and Xiaoou Tang","author":"Li Yining","year":"2016","unstructured":"Yining Li, Chen Huang, Chen Change Loy, and Xiaoou Tang. 2016. Human attribute recognition by deep hierarchical contexts. In Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part VI 14. Springer, 684--700."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_23_1","volume-title":"Query2label: A simple transformer way to multi-label classification. arXiv preprint arXiv:2107.10834","author":"Liu Shilong","year":"2021","unstructured":"Shilong Liu, Lei Zhang, Xiao Yang, Hang Su, and Jun Zhu. 2021b. Query2label: A simple transformer way to multi-label classification. arXiv preprint arXiv:2107.10834 (2021)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240567"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00015"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299097"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"e_1_3_2_1_31_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.251"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2612829"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.58"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2913513"},{"key":"e_1_3_2_1_36_1","volume-title":"HCP: A flexible CNN framework for multi-label image classification","author":"Wei Yunchao","year":"2015","unstructured":"Yunchao Wei, Wei Xia, Min Lin, Junshi Huang, Bingbing Ni, Jian Dong, Yao Zhao, and Shuicheng Yan. 2015. HCP: A flexible CNN framework for multi-label image classification. IEEE transactions on pattern analysis and machine intelligence, Vol. 38, 9 (2015), 1901--1907."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_38_1","volume-title":"GM-MLIC: graph matching based multi-label image classification. arXiv preprint arXiv:2104.14762","author":"Wu Yanan","year":"2021","unstructured":"Yanan Wu, He Liu, Songhe Feng, Yi Jin, Gengyu Lyu, and Zizhang Wu. 2021. GM-MLIC: graph matching based multi-label image classification. arXiv preprint arXiv:2104.14762 (2021)."},{"key":"e_1_3_2_1_39_1","first-page":"12077","article-title":"SegFormer: Simple and efficient design for semantic segmentation with transformers","volume":"34","author":"Xie Enze","year":"2021","unstructured":"Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M Alvarez, and Ping Luo. 2021. SegFormer: Simple and efficient design for semantic segmentation with transformers. Advances in Neural Information Processing Systems, Vol. 34 (2021), 12077--12090.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3002185"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.37"},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13440--13449","author":"Yazici Vacit Oguz","unstructured":"Vacit Oguz Yazici, Abel Gonzalez-Garcia, Arnau Ramisa, Bartlomiej Twardowski, and Joost van de Weijer. 2020. Orderless recurrent models for multi-label classification. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13440--13449."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_39"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6964"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2812605"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.660"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475191"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.219"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00025"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548343"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612159","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612159","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:04:12Z","timestamp":1755821052000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612159"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":51,"alternative-id":["10.1145\/3581783.3612159","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612159","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}