{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,25]],"date-time":"2026-01-25T15:48:17Z","timestamp":1769356097014,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754966","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"1112-1121","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Discrepancy-Aware Attention Network for Enhanced Audio-Visual Generalized Zero-Shot Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-6747-581X","authenticated-orcid":false,"given":"Runlin","family":"Yu","sequence":"first","affiliation":[{"name":"Central South University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6388-126X","authenticated-orcid":false,"given":"Yipu","family":"Gong","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2393-9016","authenticated-orcid":false,"given":"Wenrui","family":"Li","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5994-0640","authenticated-orcid":false,"given":"Aiwen","family":"Sun","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0984-208X","authenticated-orcid":false,"given":"Mengren","family":"Zheng","sequence":"additional","affiliation":[{"name":"Chong Qing University, Chongqing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"4660","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Asano Yuki","year":"2020","unstructured":"Yuki Asano, Mandela Patrick, Christian Rupprecht, and Andrea Vedaldi. 2020. Labelling unlabelled videos from scratch with multi-modal self-supervision. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 4660-4671. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/31fefc0e570cb3860f2a6d4b38c6490d-Paper.pdf"},{"key":"e_1_3_2_1_2_1","volume-title":"An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling. ArXiv","author":"Bai Shaojie","year":"2018","unstructured":"Shaojie Bai, J. Zico Kolter, and Vladlen Koltun. 2018. An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling. ArXiv, Vol. abs\/1803.01271 (2018). https:\/\/api.semanticscholar.org\/CorpusID:4747877"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2024.3441816"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102757"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/JBHI.2025.3546950"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01918"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00724"},{"key":"e_1_3_2_1_10_1","volume-title":"Modality Competition: What Makes Joint Training of Multi-modal Network Fail in Deep Learning? (Provably). ArXiv","author":"Huang Yu","year":"2022","unstructured":"Yu Huang, Junyang Lin, Chang Zhou, Hongxia Yang, and Longbo Huang. 2022. Modality Competition: What Makes Joint Training of Multi-modal Network Fail in Deep Learning? (Provably). ArXiv, Vol. abs\/2203.12221 (2022). https:\/\/api.semanticscholar.org\/CorpusID:247618748"},{"key":"e_1_3_2_1_11_1","volume-title":"Improving multimodal accuracy through modality pre-training and attention. arXiv preprint arXiv:2011.06102","author":"Ismail Aya Abdelsalam","year":"2020","unstructured":"Aya Abdelsalam Ismail, Mahmudul Hasan, and Faisal Ishtiaq. 2020. Improving multimodal accuracy through modality pre-training and attention. arXiv preprint arXiv:2011.06102 (2020)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00269"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02030"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681559"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3368964"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3233042"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME55011.2023.00080"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611758"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3430080"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3394551"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611759"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/VCIP63160.2024.10849917"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Zhuoyuan Li Junqi Liao Chuanbo Tang Haotian Zhang Yuqi Li Yifan Bian Xihua Sheng Xinmin Feng Yao Li Changsheng Gao et al. 2025b. USTC-TD: A Test Dataset and Benchmark for Image and Video Coding in 2020s. IEEE Transactions on Multimedia (2025) 1-16.","DOI":"10.1109\/TMM.2025.3608643"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBC.2024.3434520"},{"key":"e_1_3_2_1_25_1","first-page":"3089","volume-title":"2021 IEEE Winter Conference on Applications of Computer Vision (WACV) (2020","author":"Mazumder Pratik","year":"1890","unstructured":"Pratik Mazumder, Pravendra Singh, Kranti K. Parida, and Vinay P. Namboodiri. 2020. AVGZSLNet: Audio-Visual Generalized Zero-Shot Learning by Reconstructing Label Features from Multi-Modal Embeddings. 2021 IEEE Winter Conference on Applications of Computer Vision (WACV) (2020), 3089-3098. https:\/\/api.semanticscholar.org\/CorpusID:218900545"},{"key":"e_1_3_2_1_26_1","first-page":"488","article-title":"Temporal and Cross-modal Attention for Audio-Visual Zero-Shot Learning","volume":"2022","author":"Mercea Otniel-Bogdan","year":"2022","unstructured":"Otniel-Bogdan Mercea, Thomas Hummel, A. Sophia Koepke, and Zeynep Akata. 2022a. Temporal and Cross-modal Attention for Audio-Visual Zero-Shot Learning. In Computer Vision - ECCV 2022. 488-505.","journal-title":"Computer Vision - ECCV"},{"key":"e_1_3_2_1_27_1","first-page":"10543","volume-title":"Audiovisual Generalised Zero-shot Learning with Cross-modal Attention and Language. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Mercea Otniel-Bogdan","year":"2022","unstructured":"Otniel-Bogdan Mercea, Lukas Riesch, A. Sophia Koepke, and Zeynep Akata. 2022b. Audiovisual Generalised Zero-shot Learning with Cross-modal Attention and Language. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022), 10543-10553. https:\/\/api.semanticscholar.org\/CorpusID:247292578"},{"key":"e_1_3_2_1_28_1","volume-title":"Computer Vision - ECCV","author":"Mo Shentong","year":"2024","unstructured":"Shentong Mo and Pedro Morgado. 2025. Audio-Visual Generalized Zero-Shot Learning the\u00a0Easy Way. In Computer Vision - ECCV 2024, Ale\u0161 Leonardis, Elisa Ricci, Stefan Roth, Olga Russakovsky, Torsten Sattler, and G\u00fcl Varol (Eds.). Springer Nature Switzerland, Cham, 377-395."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","unstructured":"Kranti Kumar Parida Neeraj Matiyali Tanaya Guha and Gaurav Sharma. 2020. Coordinated Joint Multimodal Embeddings for Generalized Audio-Visual Zero-shot Classification and Retrieval of Videos. In 2020 IEEE Winter Conference on Applications of Computer Vision (WACV). 3240-3249. doi:10.1109\/WACV45572.2020.9093438","DOI":"10.1109\/WACV45572.2020.9093438"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475647"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613435"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00844"},{"key":"e_1_3_2_1_34_1","volume-title":"UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. ArXiv","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Zamir, and Mubarak Shah. 2012. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. ArXiv, Vol. abs\/1212.0402 (2012). https:\/\/api.semanticscholar.org\/CorpusID:7197134"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2021.3101421"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Chuanbo Tang Zhuoyuan Li Yifan Bian Li Li and Dong Liu. 2025. Neural Video Compression with Context Modulation. arXiv:2505.14541 [eess.IV] https:\/\/arxiv.org\/abs\/2505.14541","DOI":"10.1109\/CVPR52734.2025.01171"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28317"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548382"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3293318"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02581"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3468315"},{"key":"e_1_3_2_1_43_1","volume-title":"Geras","author":"Wu Nan","year":"2022","unstructured":"Nan Wu, Stanislaw Jastrzbski, Kyunghyun Cho, and Krzysztof J. Geras. 2022. Characterizing and overcoming the greedy nature of learning in multi-modal deep neural networks. arXiv:2202.05306 [cs.LG] https:\/\/arxiv.org\/abs\/2202.05306"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2857768"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3065234"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2025.129636"},{"key":"e_1_3_2_1_47_1","unstructured":"Tianzhu Ye Li Dong Yuqing Xia Yutao Sun Yi Zhu Gao Huang and Furu Wei. 2024. Differential Transformer. arXiv:2410.05258 [cs.CL] https:\/\/arxiv.org\/abs\/2410.05258"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3713070"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680626"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.3390\/math12142200"},{"key":"e_1_3_2_1_51_1","volume-title":"Multimodal Fusion on Low-quality Data: A Comprehensive Survey. ArXiv","author":"Zhang Qingyang","year":"1894","unstructured":"Qingyang Zhang, Yake Wei, Zongbo Han, Huazhu Fu, Xi Peng, Cheng Deng, Qinghua Hu, Cai Xu, Jie Wen, Di Hu, and Changqing Zhang. 2024b. Multimodal Fusion on Low-quality Data: A Comprehensive Survey. ArXiv, Vol. abs\/2404.18947 (2024). https:\/\/api.semanticscholar.org\/CorpusID:269457475"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3135440"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3326294"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28581"},{"key":"e_1_3_2_1_55_1","first-page":"7905","volume-title":"Oh (Eds.)","volume":"35","author":"Zhao Rui","year":"2022","unstructured":"Rui Zhao, Ruiqin Xiong, Jing Zhao, Zhaofei Yu, Xiaopeng Fan, and Tiejun Huang. 2022b. Learning Optical Flow from Continuous Spike Streams. In Advances in Neural Information Processing Systems, S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (Eds.), Vol. 35. Curran Associates, Inc., 7905-7920. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/33951c28630e48c441cb59db356f2037-Paper-Conference.pdf"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/VCIP49819.2020.9301771"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN54540.2023.10191705"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN54540.2023.10191705"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58523-5_46"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754966","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:07:19Z","timestamp":1765339639000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754966"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":59,"alternative-id":["10.1145\/3746027.3754966","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754966","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}