{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:10:14Z","timestamp":1765339814291,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","funder":[{"name":"Hong Kong RGC General Research Fund","award":["No. 15221123 and 15216424"],"award-info":[{"award-number":["No. 15221123 and 15216424"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755055","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"3360-3369","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Balancing Cross-Modal Attention for Generalized Zero-Shot Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-9580-1133","authenticated-orcid":false,"given":"Zhijie","family":"Rao","sequence":"first","affiliation":[{"name":"Department of Computing, The Hong Kong Polytechnic University, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0449-4525","authenticated-orcid":false,"given":"Jingcai","family":"Guo","sequence":"additional","affiliation":[{"name":"Department of Computing &amp; LSGI, The Hong Kong Polytechnic University, Hong Kong SAR, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 7603--7612","author":"Annadani Yashas","year":"2018","unstructured":"Yashas Annadani and Soma Biswas. 2018. Preserving semantic relations for zero-shot learning. In Proceedings of the IEEE conference on computer vision and pattern recognition. 7603--7612."},{"key":"e_1_3_2_1_2_1","volume-title":"International conference on machine learning. PMLR, 872--881","author":"Byrd Jonathon","year":"2019","unstructured":"Jonathon Byrd and Zachary Lipton. 2019. What is the effect of importance weighting in deep learning?. In International conference on machine learning. PMLR, 872--881."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1007\/978-3-319-46475-6_4"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_4_1","DOI":"10.1609\/aaai.v36i1.19909"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_5_1","DOI":"10.1109\/CVPR52688.2022.00746"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_6_1","DOI":"10.1109\/CVPR52733.2024.02262"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_7_1","DOI":"10.1109\/ICCV48922.2021.00019"},{"key":"e_1_3_2_1_8_1","first-page":"16622","article-title":"Hsva: Hierarchical semantic-visual adaptation for zero-shot learning","volume":"34","author":"Chen Shiming","year":"2021","unstructured":"Shiming Chen, Guosen Xie, Yang Liu, Qinmu Peng, Baigui Sun, Hao Li, Xinge You, and Ling Shao. 2021. Hsva: Hierarchical semantic-visual adaptation for zero-shot learning. Advances in Neural Information Processing Systems 34 (2021), 16622--16634.","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1609\/aaai.v37i1.25114"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the 31st ACM international conference on multimedia. 4138--4146","author":"Chen Zhi","year":"2023","unstructured":"Zhi Chen, Pengfei Zhang, Jingjing Li, Sen Wang, and Zi Huang. 2023. Zeroshot learning by harnessing adversarial samples. In Proceedings of the 31st ACM international conference on multimedia. 4138--4146."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_11_1","DOI":"10.1109\/CVPR.2019.00949"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_12_1","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, DirkWeissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, G Heigold, S Gelly, et al. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_14_1","DOI":"10.1109\/CVPRW63382.2024.00775"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.1109\/TPAMI.2015.2408354"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1145\/3503161.3547889"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_17_1","DOI":"10.1145\/3422622"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_18_1","DOI":"10.1109\/CVPRW59228.2023.00011"},{"volume-title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces. In First Conference on Language Modeling.","author":"Gu Albert","unstructured":"Albert Gu and Tri Dao. [n. d.]. Mamba: Linear-Time Sequence Modeling with Selective State Spaces. In First Conference on Language Modeling.","key":"e_1_3_2_1_19_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_20_1","DOI":"10.1609\/aaai.v37i6.25942"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_21_1","DOI":"10.1109\/TKDE.2008.239"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_22_1","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_23_1","volume-title":"ZeroMamba: Exploring Visual State Space Model for Zero-Shot Learning. AAAI Conference on Artificial Intelligence","author":"Hou Wenjin","year":"2025","unstructured":"Wenjin Hou, Dingjie Fu, Kun Li, Shiming Chen, Hehe Fan, and Yi Yang. 2025. ZeroMamba: Exploring Visual State Space Model for Zero-Shot Learning. AAAI Conference on Artificial Intelligence (2025)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_24_1","DOI":"10.1109\/CVPR.2016.580"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_25_1","DOI":"10.1145\/3664647.3681629"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_26_1","DOI":"10.1109\/CVPR42600.2020.00454"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_27_1","DOI":"10.1109\/TCYB.2020.3004641"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_28_1","DOI":"10.1109\/ICCV.2019.00986"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_29_1","DOI":"10.1609\/aaai.v36i1.20002"},{"unstructured":"Diederik P Kingma Max Welling et al. [n. d.]. Auto-encoding variational bayes.","key":"e_1_3_2_1_30_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_31_1","DOI":"10.1109\/CVPR52688.2022.00909"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_32_1","DOI":"10.1109\/CVPR.2009.5206594"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_33_1","DOI":"10.1609\/aaai.v35i3.16292"},{"key":"e_1_3_2_1_34_1","volume-title":"VS-Boost: Boosting Visual-Semantic Association for Generalized Zero-Shot Learning. In International Joint Conference on Artificial Intelligence.","author":"Li Xiaofan","year":"2023","unstructured":"Xiaofan Li, Yachao Zhang, Shiran Bian, Yanyun Qu, Yuan Xie, Zhongchao Shi, and Jianping Fan. 2023. VS-Boost: Boosting Visual-Semantic Association for Generalized Zero-Shot Learning. In International Joint Conference on Artificial Intelligence."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_35_1","DOI":"10.1109\/ICCV.2017.324"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_36_1","DOI":"10.1609\/aaai.v39i5.32586"},{"key":"e_1_3_2_1_37_1","volume-title":"PSVMA: exploring Multi-granularity semantic-visual adaption for generalized zero-shot learning","author":"Liu Man","year":"2024","unstructured":"Man Liu, Huihui Bai, Feng Li, Chunjie Zhang, YunchaoWei, MengWang, Tat-Seng Chua, and Yao Zhao. 2024. PSVMA: exploring Multi-granularity semantic-visual adaption for generalized zero-shot learning. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_38_1","DOI":"10.1109\/CVPR52729.2023.01472"},{"key":"e_1_3_2_1_39_1","volume-title":"Vmamba: Visual state space model. Advances in neural information processing systems 37","author":"Liu Yue","year":"2024","unstructured":"Yue Liu, Yunjie Tian, Yuzhong Zhao, Hongtian Yu, Lingxi Xie, Yaowei Wang, Qixiang Ye, Jianbin Jiao, and Yunfan Liu. 2024. Vmamba: Visual state space model. Advances in neural information processing systems 37 (2024), 103031--103063."},{"key":"e_1_3_2_1_40_1","first-page":"12283","article-title":"I2dformer: Learning image to document attention for zero-shot image classification","volume":"35","author":"Naeem Muhammad Ferjad","year":"2022","unstructured":"Muhammad Ferjad Naeem, Yongqin Xian, Luc V Gool, and Federico Tombari. 2022. I2dformer: Learning image to document attention for zero-shot image classification. Advances in Neural Information Processing Systems 35 (2022), 12283--12294.","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_41_1","DOI":"10.1109\/CVPR.2012.6247998"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_42_1","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence. 4833--4841","author":"Rao Zhijie","year":"2024","unstructured":"Zhijie Rao, Jingcai Guo, Xiaocheng Lu, Jingming Liang, Jie Zhang, Haozhao Wang, Kang Wei, and Xiaofeng Cao. 2024. Dual expert distillation network for generalized zero-shot learning. In Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence. 4833--4841."},{"key":"e_1_3_2_1_44_1","volume-title":"Srcd: Semantic reasoning with compound domains for single-domain generalized object detection","author":"Rao Zhijie","year":"2024","unstructured":"Zhijie Rao, Jingcai Guo, Luyao Tang, Yue Huang, Xinghao Ding, and Song Guo. 2024. Srcd: Semantic reasoning with compound domains for single-domain generalized object detection. IEEE Transactions on Neural Networks and Learning Systems (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_45_1","DOI":"10.1109\/CVPR52688.2022.00777"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_46_1","DOI":"10.1109\/WACV48630.2021.00033"},{"key":"e_1_3_2_1_47_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_48_1","DOI":"10.1109\/CVPR.2018.00450"},{"unstructured":"CatherineWah Steve Branson PeterWelinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011).","key":"e_1_3_2_1_49_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_50_1","DOI":"10.1609\/aaai.v38i6.28381"},{"key":"e_1_3_2_1_51_1","volume-title":"Learning to model the tail. Advances in neural information processing systems 30","author":"Wang Yu-Xiong","year":"2017","unstructured":"Yu-Xiong Wang, Deva Ramanan, and Martial Hebert. 2017. Learning to model the tail. Advances in neural information processing systems 30 (2017)."},{"unstructured":"Zhengbo Wang Jian Liang Ran He Nan Xu Zilei Wang and Tieniu Tan. 2023. Improving zero-shot generalization for clip with synthesized prompts. (2023) 3032--3042.","key":"e_1_3_2_1_52_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_53_1","DOI":"10.1109\/CVPR.2018.00581"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_54_1","DOI":"10.1109\/CVPR.2017.328"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_55_1","DOI":"10.1109\/CVPR.2019.00961"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_56_1","DOI":"10.1109\/CVPR52688.2022.00910"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_57_1","DOI":"10.1109\/TIP.2023.3295738"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_58_1","DOI":"10.1109\/CVPR.2017.321"}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MM '25","name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755055","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:07:59Z","timestamp":1765339679000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755055"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":58,"alternative-id":["10.1145\/3746027.3755055","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755055","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}