{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T00:43:55Z","timestamp":1775695435004,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFC3301000"],"award-info":[{"award-number":["2022YFC3301000"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680694","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"4246-4255","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Causal Visual-semantic Correlation for Zero-shot Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6314-4114","authenticated-orcid":false,"given":"Shuhuang","family":"Chen","sequence":"first","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1327-8312","authenticated-orcid":false,"given":"Dingjie","family":"Fu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9633-3392","authenticated-orcid":false,"given":"Shiming","family":"Chen","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of Artificial Intelligence, Abu Dhabi, United Arab Emirates"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7756-8233","authenticated-orcid":false,"given":"Shuo","family":"Ye","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1186-2373","authenticated-orcid":false,"given":"Wenjin","family":"Hou","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6227-1346","authenticated-orcid":false,"given":"Xinge","family":"You","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Faisal Alamri and Anjan Dutta. 2021. Multi-Head Self-Attention via Vision Transformer for Zero-Shot Learning. In IMVIP."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-92659-5_30"},{"key":"e_1_3_2_1_3_1","first-page":"1462","article-title":"A causal view of compositional zero-shot recognition","volume":"33","author":"Atzmon Yuval","year":"2020","unstructured":"Yuval Atzmon, Felix Kreuk, Uri Shalit, and Gal Chechik. 2020. A causal view of compositional zero-shot recognition. In NIPS, Vol. 33. 1462--1473.","journal-title":"NIPS"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01043"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_4"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00115"},{"key":"e_1_3_2_1_7_1","volume-title":"TransZero: Cross attribute-guided transformer for zero-shot learning","author":"Chen Shiming","year":"2022","unstructured":"Shiming Chen, Ziming Hong, Wenjin Hou, Guo-Sen Xie, Yibing Song, Jian Zhao, Xinge You, Shuicheng Yan, and Ling Shao. 2022. TransZero: Cross attribute-guided transformer for zero-shot learning. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 11 (2022), 12844--12861."},{"key":"e_1_3_2_1_8_1","first-page":"3","article-title":"Transzero: Attribute-guided transformer for zero-shot learning","volume":"2","author":"Chen Shiming","year":"2022","unstructured":"Shiming Chen, Ziming Hong, Yang Liu, Guo-Sen Xie, Baigui Sun, Hao Li, Qinmu Peng, Ke Lu, and Xinge You. 2022. Transzero: Attribute-guided transformer for zero-shot learning. In AAAI, Vol. 2. 3.","journal-title":"AAAI"},{"key":"e_1_3_2_1_9_1","volume-title":"GNDAN: Graph navigated dual attention network for zero-shot learning","author":"Chen Shiming","year":"2022","unstructured":"Shiming Chen, Ziming Hong, Guosen Xie, Qinmu Peng, Xinge You, Weiping Ding, and Ling Shao. 2022. GNDAN: Graph navigated dual attention network for zero-shot learning. IEEE transactions on neural networks and learning systems, Vol. 35, 4 (2022), 4516--4529."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00746"},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Machine Learning. PMLR, 4611--4622","author":"Chen Shiming","year":"2023","unstructured":"Shiming Chen, Wenjin Hou, Ziming Hong, Xiaohan Ding, Yibing Song, Xinge You, Tongliang Liu, and Kun Zhang. 2023. Evolving semantic prototype improves generative zero-shot learning. In International Conference on Machine Learning. PMLR, 4611--4622."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00019"},{"key":"e_1_3_2_1_13_1","first-page":"16622","article-title":"Hsva: Hierarchical semantic-visual adaptation for zero-shot learning","volume":"34","author":"Chen Shiming","year":"2021","unstructured":"Shiming Chen, GuoSen Xie, Yang Liu, Qinmu Peng, Baigui Sun, Hao Li, Xinge You, and Ling Shao. 2021. Hsva: Hierarchical semantic-visual adaptation for zero-shot learning. Advances in Neural Information Processing Systems, Vol. 34 (2021), 16622--16634.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25114"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Zhi Chen Yadan Luo Sen Wang Ruihong Qiu Jingjing Li and Zi Huang. 2021. Mitigating generation shifts for generalized zero-shot learning. In ACM MM. 844--852.","DOI":"10.1145\/3474085.3475258"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611823"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_18_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR (2021)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.666"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206772"},{"key":"e_1_3_2_1_21_1","volume-title":"Learning visual attributes. Advances in neural information processing systems","author":"Ferrari Vittorio","year":"2007","unstructured":"Vittorio Ferrari and Andrew Zisserman. 2007. Learning visual attributes. Advances in neural information processing systems, Vol. 20 (2007)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547889"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00240"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Ziming Hong Shiming Chen Guo-Sen Xie Wenhan Yang Jian Zhao Yuanjie Shao Qinmu Peng and Xinge You. 2022. Semantic Compression Embedding for Generative Zero-Shot Learning. In IJCAI. 956--963.","DOI":"10.24963\/ijcai.2022\/134"},{"key":"e_1_3_2_1_25_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Hong Ziming","year":"2024","unstructured":"Ziming Hong, Zhenyi Wang, Li Shen, Yu Yao, Zhuo Huang, Shiming Chen, Chuanwu Yang, Mingming Gong, and Tongliang Liu. 2024. Improving non-transferable representation learning by harnessing content and style. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02230"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00454"},{"key":"e_1_3_2_1_28_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In ICLR."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206594"},{"key":"e_1_3_2_1_30_1","volume-title":"Attribute-based classification for zero-shot visual object categorization","author":"Lampert Christoph H","year":"2013","unstructured":"Christoph H Lampert, Hannes Nickisch, and Stefan Harmeling. 2013. Attribute-based classification for zero-shot visual object categorization. IEEE transactions on pattern analysis and machine intelligence, Vol. 36, 3 (2013), 453--465."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.483"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.553"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01472"},{"key":"e_1_3_2_1_34_1","volume-title":"Generalized zero-shot learning with deep calibration network. Advances in neural information processing systems","author":"Liu Shichen","year":"2018","unstructured":"Shichen Liu, Mingsheng Long, Jianmin Wang, and Michael I Jordan. 2018. Generalized zero-shot learning with deep calibration network. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00379"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00788"},{"key":"e_1_3_2_1_37_1","volume-title":"Distributed representations of words and phrases and their compositionality. Advances in neural information processing systems","author":"Mikolov Tomas","year":"2013","unstructured":"Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S Corrado, and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. Advances in neural information processing systems, Vol. 26 (2013)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372850"},{"key":"e_1_3_2_1_39_1","volume-title":"Zero-shot learning with semantic output codes. Advances in neural information processing systems","author":"Palatucci Mark","year":"2009","unstructured":"Mark Palatucci, Dean Pomerleau, Geoffrey E Hinton, and Tom M Mitchell. 2009. Zero-shot learning with semantic output codes. Advances in neural information processing systems, Vol. 22 (2009)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247998"},{"key":"e_1_3_2_1_41_1","unstructured":"Judea Pearl. 2009. Causality. Cambridge university press."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Judea Pearl. 2022. Direct and indirect effects. In Probabilistic and causal inference: the works of Judea Pearl. 373--392.","DOI":"10.1145\/3501714.3501736"},{"key":"e_1_3_2_1_43_1","volume-title":"Causal inference in statistics: A primer","author":"Pearl Judea","unstructured":"Judea Pearl, Madelyn Glymour, and Nicholas P Jewell. 2016. Causal inference in statistics: A primer. John Wiley & Sons."},{"key":"e_1_3_2_1_44_1","unstructured":"Judea Pearl and Dana Mackenzie. 2018. The book of why: the new science of cause and effect. Basic books."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_46_1","volume-title":"Xi-Zhao Wang, and QM Jonathan Wu.","author":"Pourpanah Farhad","year":"2022","unstructured":"Farhad Pourpanah, Moloud Abdar, Yuxuan Luo, Xinlei Zhou, Ran Wang, Chee Peng Lim, Xi-Zhao Wang, and QM Jonathan Wu. 2022. A review of generalized zero-shot learning methods. IEEE transactions on pattern analysis and machine intelligence (2022)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00106"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.13"},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the 32nd International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"2161","author":"Romera-Paredes Bernardino","year":"2015","unstructured":"Bernardino Romera-Paredes and Philip Torr. 2015. An embarrassingly simple approach to zero-shot learning. In Proceedings of the 32nd International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 37),, Francis Bach and David Blei (Eds.). PMLR, Lille, France, 2152--2161."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-023-4051-9"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Xinhang Song Haitao Zeng Sixian Zhang Luis Herranz and Shuqiang Jiang. 2020. Generalized zero-shot learning with multi-source semantic embeddings for scene recognition. In ACM MM. 3976--3985.","DOI":"10.1145\/3394171.3413568"},{"key":"e_1_3_2_1_52_1","volume-title":"Explanation in causal inference: methods for mediation and interaction","author":"VanderWeele Tyler","unstructured":"Tyler VanderWeele. 2015. Explanation in causal inference: methods for mediation and interaction. Oxford University Press."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00450"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00308"},{"key":"e_1_3_2_1_55_1","unstructured":"Peter Welinder Steve Branson Takeshi Mita Catherine Wah Florian Schroff Serge Belongie and Pietro Perona. 2010. Caltech-UCSD birds 200. (2010)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1488"},{"key":"e_1_3_2_1_57_1","volume-title":"Zero-shot learning?a comprehensive evaluation of the good, the bad and the ugly","author":"Xian Yongqin","year":"2018","unstructured":"Yongqin Xian, Christoph H Lampert, Bernt Schiele, and Zeynep Akata. 2018. Zero-shot learning?a comprehensive evaluation of the good, the bad and the ugly. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 9 (2018), 2251--2265."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00581"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.328"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01052"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2021.3073655"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00972"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3295738"},{"key":"e_1_3_2_1_64_1","volume-title":"Advances in Neural Information Processing Systems","author":"Ji Zhong","year":"2018","unstructured":"yunlong yu, Zhong Ji, Yanwei Fu, Jichang Guo, Yanwei Pang, and Zhongfei (Mark) Zhang. 2018. Stacked Semantics-Guided Attention Model for Fine-Grained Zero-Shot Learning. In Advances in Neural Information Processing Systems,, S. Bengio, H. Wallach, H. Larochelle, K. Grauman, N. Cesa-Bianchi, and R. Garnett (Eds.), Vol. 31. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2018\/file\/9087b0efc7c7acd1ef7e153678809c77-Paper.pdf"},{"key":"e_1_3_2_1_65_1","unstructured":"Zhongqi Yue Tan Wang Qianru Sun Xian-Sheng Hua and Hanwang Zhang. 2021. Counterfactual Zero-Shot and Open-Set Visual Recognition. In CVPR. 15404--15414."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.321"},{"key":"e_1_3_2_1_67_1","volume-title":"Deepvit: Towards deeper vision transformer. arXiv preprint arXiv:2103.11886","author":"Zhou Daquan","year":"2021","unstructured":"Daquan Zhou, Bingyi Kang, Xiaojie Jin, Linjie Yang, Xiaochen Lian, Zihang Jiang, Qibin Hou, and Jiashi Feng. 2021. Deepvit: Towards deeper vision transformer. arXiv preprint arXiv:2103.11886 (2021)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00111"},{"key":"e_1_3_2_1_69_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Zhu Yizhe","year":"2019","unstructured":"Yizhe Zhu, Jianwen Xie, Zhiqiang Tang, Xi Peng, and Ahmed Elgammal. 2019. Semantic-guided multi-attention localization for zero-shot learning. Advances in Neural Information Processing Systems, Vol. 32 (2019)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680694","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680694","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:57Z","timestamp":1750295877000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680694"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":69,"alternative-id":["10.1145\/3664647.3680694","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680694","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}