{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T03:29:43Z","timestamp":1769743783526,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658005","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"740-748","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Pyramidal Cross-Modal Transformer with Sustained Visual Guidance for Multi-Label Image Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4424-8249","authenticated-orcid":false,"given":"Zhuohua","family":"Li","sequence":"first","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7113-3017","authenticated-orcid":false,"given":"Ruyun","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7061-3329","authenticated-orcid":false,"given":"Fuqing","family":"Zhu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1107-3873","authenticated-orcid":false,"given":"Jizhong","family":"Han","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Science, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7170-3809","authenticated-orcid":false,"given":"Songlin","family":"Hu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"CVPR workshops. 9--17","author":"Cevikalp Hakan","year":"2019","unstructured":"Hakan Cevikalp, Burak Benligiray, \u00d6mer Nezih Gerek, and Hasan Saribas. 2019. Semi-Supervised Robust Deep Neural Networks for Multi-Label Classification.. In CVPR workshops. 9--17."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3591106.3592267"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00061"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00532"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9860016"},{"key":"e_1_3_2_1_6_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_7_1","volume-title":"Christopher KI Williams, John Winn, and Andrew Zisserman.","author":"Everingham Mark","year":"2010","unstructured":"Mark Everingham, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. 2010. The pascal visual object classes (voc) challenge. International journal of computer vision 88 (2010), 303--338."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3088605"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","volume-title":"AQT: Adversarial Query Transformers for Domain Adaptive Object Detection.. In IJCAI. 972--979.","author":"Huang Wei-Jie","year":"2022","unstructured":"Wei-Jie Huang, Yu-Lin Lu, Shih-Yao Lin, Yusheng Xie, and Yen-Yu Lin. 2022. AQT: Adversarial Query Transformers for Domain Adaptive Object Detection.. In IJCAI. 972--979."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01621"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00170"},{"key":"e_1_3_2_1_15_1","volume-title":"Learning category correlations for multi-label image recognition with graph networks. arXiv preprint arXiv:1909.13005","author":"Li Qing","year":"2019","unstructured":"Qing Li, Xiaojiang Peng, Yu Qiao, and Qiang Peng. 2019. Learning category correlations for multi-label image recognition with graph networks. arXiv preprint arXiv:1909.13005 (2019)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3591106.3592269"},{"key":"e_1_3_2_1_17_1","volume-title":"UFI: A Unified Feature Interaction Framework for Multi-Label Image Classification. In 2022 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6.","author":"Li Zhuohua","year":"2022","unstructured":"Zhuohua Li, Weibo Zhang, Ziang Yang, Fuqing Zhu, Jizhong Han, and Songlin Hu. 2022. UFI: A Unified Feature Interaction Framework for Multi-Label Image Classification. In 2022 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_19_1","volume-title":"Query2label: A simple transformer way to multi-label classification. arXiv preprint arXiv:2107.10834","author":"Liu Shilong","year":"2021","unstructured":"Shilong Liu, Lei Zhang, Xiao Yang, Hang Su, and Jun Zhu. 2021. Query2label: A simple transformer way to multi-label classification. arXiv preprint arXiv:2107.10834 (2021)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3338533.3366589"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i10.17098"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00015"},{"key":"e_1_3_2_1_24_1","volume-title":"International conference on machine learning. PMLR, 6105--6114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. Efficientnet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning. PMLR, 6105--6114."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_27"},{"key":"e_1_3_2_1_26_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.251"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6909"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.58"},{"key":"e_1_3_2_1_30_1","volume-title":"HCP: A flexible CNN framework for multi-label image classification","author":"Wei Yunchao","year":"2015","unstructured":"Yunchao Wei,Wei Xia, Min Lin, Junshi Huang, Bingbing Ni, Jian Dong, Yao Zhao, and Shuicheng Yan. 2015. HCP: A flexible CNN framework for multi-label image classification. IEEE transactions on pattern analysis and machine intelligence 38, 9 (2015), 1901--1907."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3002185"},{"key":"e_1_3_2_1_32_1","volume-title":"Billion-scale semi-supervised learning for image classification. arXiv preprint arXiv:1905.00546","author":"Yalniz I Zeki","year":"2019","unstructured":"I Zeki Yalniz, Herv\u00e9 J\u00e9gou, Kan Chen, Manohar Paluri, and Dhruv Mahajan. 2019. Billion-scale semi-supervised learning for image classification. arXiv preprint arXiv:1905.00546 (2019)."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13440--13449","author":"Yazici Vacit Oguz","unstructured":"Vacit Oguz Yazici, Abel Gonzalez-Garcia, Arnau Ramisa, Bartlomiej Twardowski, and Joost van de Weijer. 2020. Orderless recurrent models for multi-label classification. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13440--13449."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_39"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3595923"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547834"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2812605"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475191"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519030"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.219"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548343"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00142"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658005","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658005","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:53:32Z","timestamp":1755766412000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658005"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":42,"alternative-id":["10.1145\/3652583.3658005","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658005","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}