{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:29:36Z","timestamp":1764588576735,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"the Climbing Plan Project","award":["Grant No. E3Z0261"],"award-info":[{"award-number":["Grant No. E3Z0261"]}]},{"name":"the Central Guidance for Local Special Project","award":["Grant No. Z231100005923044"],"award-info":[{"award-number":["Grant No. Z231100005923044"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680829","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"4581-4590","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Visual-Semantic Decomposition and Partial Alignment for Document-based Zero-Shot Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3658-8099","authenticated-orcid":false,"given":"Xiangyan","family":"Qu","sequence":"first","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3966-511X","authenticated-orcid":false,"given":"Jing","family":"Yu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6784-0221","authenticated-orcid":false,"given":"Keke","family":"Gai","sequence":"additional","affiliation":[{"name":"School of Cyberspace Science and Technology, Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1598-0764","authenticated-orcid":false,"given":"Jiamin","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2849-1852","authenticated-orcid":false,"given":"Yuanmin","family":"Tang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3190-6521","authenticated-orcid":false,"given":"Gang","family":"Xiong","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3533-4874","authenticated-orcid":false,"given":"Gaopeng","family":"Gou","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3631-256X","authenticated-orcid":false,"given":"Qi","family":"Wu","sequence":"additional","affiliation":[{"name":"Australia Institute of Machine Learning, University of Adelaide, Adelaide, Australia"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Label-Embedding for Attribute-Based Classification. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Akata Zeynep","year":"2013","unstructured":"Zeynep Akata, Florent Perronnin, Za\"id Harchaoui, and Cordelia Schmid. 2013. Label-Embedding for Attribute-Based Classification. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2013. 819--826."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2487986"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298911"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.543"},{"key":"e_1_3_2_1_5_1","volume-title":"Predicting Deep Zero-Shot Convolutional Neural Networks Using Textual Descriptions. In IEEE\/CVF International Conference on Computer Vision, ICCV","author":"Ba Lei Jimmy","year":"2015","unstructured":"Lei Jimmy Ba, Kevin Swersky, Sanja Fidler, and Ruslan Salakhutdinov. 2015. Predicting Deep Zero-Shot Convolutional Neural Networks Using Textual Descriptions. In IEEE\/CVF International Conference on Computer Vision, ICCV 2015. 4247--4255."},{"key":"e_1_3_2_1_6_1","volume-title":"Longformer: The Long-Document Transformer. CoRR","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew E. Peters, and Arman Cohan. 2020. Longformer: The Long-Document Transformer. CoRR, Vol. abs\/2004.05150 (2020). showeprint[arXiv]2004.05150"},{"key":"e_1_3_2_1_7_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems, NeurIPS 2020."},{"key":"e_1_3_2_1_8_1","volume-title":"Large-Scale Zero-Shot Image Classification from Rich and Diverse Textual Descriptions. CoRR","author":"Bujwid Sebastian","year":"2021","unstructured":"Sebastian Bujwid and Josephine Sullivan. 2021. Large-Scale Zero-Shot Image Classification from Rich and Diverse Textual Descriptions. CoRR, Vol. abs\/2103.09669 (2021). showeprint[arXiv]2103.09669"},{"key":"e_1_3_2_1_9_1","volume-title":"Synthesized Classifiers for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Changpinyo Soravit","year":"2016","unstructured":"Soravit Changpinyo, Wei-Lun Chao, Boqing Gong, and Fei Sha. 2016. Synthesized Classifiers for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016. 5327--5336."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_4"},{"key":"e_1_3_2_1_11_1","volume-title":"MSDN: Mutually Semantic Distillation Network for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Chen Shiming","year":"2022","unstructured":"Shiming Chen, Ziming Hong, Guo-Sen Xie, Wenhan Yang, Qinmu Peng, Kai Wang, Jian Zhao, and Xinge You. 2022. MSDN: Mutually Semantic Distillation Network for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2022. 7602--7611."},{"key":"e_1_3_2_1_12_1","first-page":"405","article-title":"DUET: Cross-Modal Semantic Grounding for Contrastive Zero-Shot Learning. In Thirty-Seventh AAAI Conference on Artificial Intelligence, AAAI, Thirty-Fifth Conference on Innovative Applications of Artificial Intelligence","volume":"2023","author":"Chen Zhuo","year":"2023","unstructured":"Zhuo Chen, Yufeng Huang, Jiaoyan Chen, Yuxia Geng, Wen Zhang, Yin Fang, Jeff Z. Pan, and Huajun Chen. 2023. DUET: Cross-Modal Semantic Grounding for Contrastive Zero-Shot Learning. In Thirty-Seventh AAAI Conference on Artificial Intelligence, AAAI, Thirty-Fifth Conference on Innovative Applications of Artificial Intelligence, IAAI 2023. 405--413.","journal-title":"IAAI"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611823"},{"key":"e_1_3_2_1_14_1","volume-title":"Probabilistic Embeddings for Cross-Modal Retrieval. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Chun Sanghyuk","year":"2021","unstructured":"Sanghyuk Chun, Seong Joon Oh, Rafael Sampaio de Rezende, Yannis Kalantidis, and Diane Larlus. 2021. Probabilistic Embeddings for Cross-Modal Retrieval. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021. 8415--8424."},{"volume-title":"Default Probability. Cognitive Science","year":"1991","key":"e_1_3_2_1_15_1","unstructured":"Daniel, N., Osherson, Joshua, Stern, Ormond, Wilkie, Michael, Stob, and Edward. 1991. Default Probability. Cognitive Science (1991)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019. 4171--4186."},{"key":"e_1_3_2_1_18_1","volume-title":"9th International Conference on Learning Representations, ICLR","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021."},{"key":"e_1_3_2_1_19_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Elhoseiny Mohamed","year":"2017","unstructured":"Mohamed Elhoseiny, Yizhe Zhu, Han Zhang, and Ahmed M. Elgammal. 2017. Link the Head to the \"Beak\": Zero Shot Learning from Noisy Text Description at Part Precision. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017. 6288--6297."},{"key":"e_1_3_2_1_20_1","volume-title":"GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models. CoRR","author":"Eloundou Tyna","year":"2023","unstructured":"Tyna Eloundou, Sam Manning, Pamela Mishkin, and Daniel Rock. 2023. GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models. CoRR, Vol. abs\/2303.10130 (2023). showeprint[arXiv]2303.10130"},{"key":"e_1_3_2_1_21_1","volume-title":"DeViSE: A Deep Visual-Semantic Embedding Model. In Advances in Neural Information Processing Systems 26: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Gregory S. Corrado, Jonathon Shlens, Samy Bengio, Jeffrey Dean, Marc'Aurelio Ranzato, and Tom\u00e1s Mikolov. 2013. DeViSE: A Deep Visual-Semantic Embedding Model. In Advances in Neural Information Processing Systems 26: Annual Conference on Neural Information Processing Systems, NeurIPS 2013. 2121--2129."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547889"},{"key":"e_1_3_2_1_23_1","volume-title":"Bridging Nonlinearities and Stochastic Regularizers with Gaussian Error Linear Units. CoRR","author":"Hendrycks Dan","year":"2016","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Bridging Nonlinearities and Stochastic Regularizers with Gaussian Error Linear Units. CoRR, Vol. abs\/1606.08415 (2016). showeprint[arXiv]1606.08415"},{"volume-title":"The 50th Annual Meeting of the Association for Computational Linguistics, Proceedings of the Conference. 873--882","author":"Huang Eric H.","key":"e_1_3_2_1_24_1","unstructured":"Eric H. Huang, Richard Socher, Christopher D. Manning, and Andrew Y. Ng. 2012. Improving Word Representations via Global Context and Multiple Word Prototypes. In The 50th Annual Meeting of the Association for Computational Linguistics, Proceedings of the Conference. 873--882."},{"key":"e_1_3_2_1_25_1","volume-title":"Generating Visual Representations for Zero-Shot Classification. In IEEE\/CVF International Conference on Computer Vision, ICCV 2017 - Workshops. 2666--2673","author":"Jurie Fr\u00e9d\u00e9ric","year":"2017","unstructured":"Fr\u00e9d\u00e9ric Jurie, Maxime Bucher, and St\u00e9phane Herbin. 2017. Generating Visual Representations for Zero-Shot Classification. In IEEE\/CVF International Conference on Computer Vision, ICCV 2017 - Workshops. 2666--2673."},{"key":"e_1_3_2_1_26_1","volume-title":"Rethinking Knowledge Graph Propagation for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Kampffmeyer Michael","year":"2019","unstructured":"Michael Kampffmeyer, Yinbo Chen, Xiaodan Liang, Hao Wang, Yujia Zhang, and Eric P. Xing. 2019. Rethinking Knowledge Graph Propagation for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019. 11487--11496."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings, The Twenty-First National Conference on Artificial Intelligence and the Eighteenth Innovative Applications of Artificial Intelligence Conference. 381--388","author":"Kemp Charles","year":"2006","unstructured":"Charles Kemp, Joshua B. Tenenbaum, Thomas L. Griffiths, Takeshi Yamada, and Naonori Ueda. 2006. Learning Systems of Concepts with an Infinite Relational Model. In Proceedings, The Twenty-First National Conference on Artificial Intelligence and the Eighteenth Innovative Applications of Artificial Intelligence Conference. 381--388."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.250"},{"key":"e_1_3_2_1_29_1","volume-title":"Improving Cross-Modal Retrieval with Set of Diverse Embeddings. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Kim Dongwon","year":"2023","unstructured":"Dongwon Kim, Namyup Kim, and Suha Kwak. 2023. Improving Cross-Modal Retrieval with Set of Diverse Embeddings. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2023. 23422--23431."},{"key":"e_1_3_2_1_30_1","volume-title":"En-Compactness: Self-Distillation Embedding & Contrastive Generation for Generalized Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Kong Xia","year":"2022","unstructured":"Xia Kong, Zuodong Gao, Xiaofan Li, Ming Hong, Jun Liu, Chengjie Wang, Yuan Xie, and Yanyun Qu. 2022. En-Compactness: Self-Distillation Embedding & Contrastive Generation for Generalized Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2022. 9296--9305."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206594"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.140"},{"key":"e_1_3_2_1_33_1","volume-title":"Text-Adaptive Multiple Visual Prototype Matching for Video-Text Retrieval. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Lin Chengzhi","year":"2022","unstructured":"Chengzhi Lin, Ancong Wu, Junwei Liang, Jun Zhang, Wenhang Ge, Wei-Shi Zheng, and Chunhua Shen. 2022. Text-Adaptive Multiple Visual Prototype Matching for Video-Text Retrieval. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems, NeurIPS 2022."},{"key":"e_1_3_2_1_34_1","volume-title":"Progressive Semantic-Visual Mutual Adaption for Generalized Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Liu Man","year":"2023","unstructured":"Man Liu, Feng Li, Chunjie Zhang, Yunchao Wei, Huihui Bai, and Yao Zhao. 2023. Progressive Semantic-Visual Mutual Adaption for Generalized Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2023. 15337--15346."},{"key":"e_1_3_2_1_35_1","volume-title":"Object-Centric Learning with Slot Attention. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Locatello Francesco","year":"2020","unstructured":"Francesco Locatello, Dirk Weissenborn, Thomas Unterthiner, Aravindh Mahendran, Georg Heigold, Jakob Uszkoreit, Alexey Dosovitskiy, and Thomas Kipf. 2020. Object-Centric Learning with Slot Attention. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems, NeurIPS 2020."},{"key":"e_1_3_2_1_36_1","volume-title":"Advances in Neural Information Processing Systems 26: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Mikolov Tom\u00e1s","year":"2013","unstructured":"Tom\u00e1s Mikolov, Ilya Sutskever, Kai Chen, Gregory S. Corrado, and Jeffrey Dean. 2013. Distributed Representations of Words and Phrases and their Compositionality. In Advances in Neural Information Processing Systems 26: Annual Conference on Neural Information Processing Systems, NeurIPS 2013. 3111--3119."},{"key":"e_1_3_2_1_37_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Naeem M.","year":"2023","unstructured":"M. Naeem, M. Ali Khan, Y. Xian, M. Afzal, D. Stricker, L. Van Gool, and F. Tombari. 2023. I2MVFormer: Large Language Model Generated Multi-View Document Supervision for Zero-Shot Image Classification. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2023. 15169--15179."},{"key":"e_1_3_2_1_38_1","volume-title":"Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Naeem Muhammad Ferjad","year":"2022","unstructured":"Muhammad Ferjad Naeem, Yongqin Xian, Luc Van Gool, and Federico Tombari. 2022. I2DFormer: Learning Image to Document Attention for Zero-Shot Image Classification. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems, NeurIPS 2022."},{"key":"e_1_3_2_1_39_1","volume-title":"Learning Graph Embeddings for Compositional Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Naeem Muhammad Ferjad","year":"2021","unstructured":"Muhammad Ferjad Naeem, Yongqin Xian, Federico Tombari, and Zeynep Akata. 2021. Learning Graph Embeddings for Compositional Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021. 953--962."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_41_1","volume-title":"Zero-shot Learning with Semantic Output Codes. In Advances in Neural Information Processing Systems 22: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Palatucci Mark","year":"2009","unstructured":"Mark Palatucci, Dean Pomerleau, Geoffrey E. Hinton, and Tom M. Mitchell. 2009. Zero-shot Learning with Semantic Output Codes. In Advances in Neural Information Processing Systems 22: Annual Conference on Neural Information Processing Systems, NeurIPS 2009. 1410--1418."},{"volume-title":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, EMNLP 2014. 1532--1543","author":"Pennington Jeffrey","key":"e_1_3_2_1_42_1","unstructured":"Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. Glove: Global Vectors for Word Representation. In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, EMNLP 2014. 1532--1543."},{"key":"e_1_3_2_1_43_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Qiao Ruizhi","year":"2016","unstructured":"Ruizhi Qiao, Lingqiao Liu, Chunhua Shen, and Anton van den Hengel. 2016. Less is More: Zero-Shot Learning from Online Textual Documents with Noise Suppression. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016. 2249--2257."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 32nd International Conference on Machine Learning, ICML 2015.","volume":"37","author":"Romera-Paredes Bernardino","unstructured":"Bernardino Romera-Paredes and Philip H. S. Torr. 2015. An embarrassingly simple approach to zero-shot learning. In Proceedings of the 32nd International Conference on Machine Learning, ICML 2015., Vol. 37. 2152--2161."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/0306-4573(88)90021-0"},{"key":"e_1_3_2_1_46_1","volume-title":"Zero-Shot Learning Through Cross-Modal Transfer. In Advances in Neural Information Processing Systems 26: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Socher Richard","year":"2013","unstructured":"Richard Socher, Milind Ganjoo, Christopher D. Manning, and Andrew Y. Ng. 2013. Zero-Shot Learning Through Cross-Modal Transfer. In Advances in Neural Information Processing Systems 26: Annual Conference on Neural Information Processing Systems, NeurIPS 2013. 935--943."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_29"},{"key":"e_1_3_2_1_48_1","volume-title":"MPNet: Masked and Permuted Pre-training for Language Understanding. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Song Kaitao","year":"2020","unstructured":"Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, and Tie-Yan Liu. 2020. MPNet: Masked and Permuted Pre-training for Language Understanding. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems, NeurIPS 2020."},{"key":"e_1_3_2_1_49_1","volume-title":"Polysemous Visual-Semantic Embedding for Cross-Modal Retrieval. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019. 1979","author":"Song Yale","year":"2019","unstructured":"Yale Song and Mohammad Soleymani. 2019. Polysemous Visual-Semantic Embedding for Cross-Modal Retrieval. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019. 1979--1988."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00773"},{"key":"e_1_3_2_1_51_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton-Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aur\u00e9lien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. CoRR Vol. abs\/2307.09288 (2023)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00450"},{"key":"e_1_3_2_1_53_1","volume-title":"The Caltech-UCSD Birds-200--2011 Dataset. california institute of technology","author":"Wah Catherine","year":"2011","unstructured":"Catherine Wah, Steve Branson, Peter Welinder, Pietro Perona, and Serge Belongie. 2011. The Caltech-UCSD Birds-200--2011 Dataset. california institute of technology (2011)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00717"},{"key":"e_1_3_2_1_55_1","unstructured":"Website. 2001. Wikipedia. https:\/\/en.wikipedia.org\/."},{"key":"e_1_3_2_1_56_1","unstructured":"Website. 2020. A-Z Animals. https:\/\/a-z-animals.com\/."},{"key":"e_1_3_2_1_57_1","unstructured":"Website. 2022. All About Birds. https:\/\/www.aboutbirds.org\/."},{"key":"e_1_3_2_1_58_1","volume-title":"Latent Embeddings for Zero-Shot Classification. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Xian Yongqin","year":"2016","unstructured":"Yongqin Xian, Zeynep Akata, Gaurav Sharma, Quynh Nguyen, Matthias Hein, and Bernt Schiele. 2016. Latent Embeddings for Zero-Shot Classification. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016. 69--77."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2857768"},{"key":"e_1_3_2_1_60_1","volume-title":"Feature Generating Networks for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Xian Yongqin","year":"2018","unstructured":"Yongqin Xian, Tobias Lorenz, Bernt Schiele, and Zeynep Akata. 2018. Feature Generating Networks for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018. 5542--5551."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01052"},{"key":"e_1_3_2_1_62_1","volume-title":"Attribute Prototype Network for Zero-Shot Learning. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Xu Wenjia","year":"2020","unstructured":"Wenjia Xu, Yongqin Xian, Jiuniu Wang, Bernt Schiele, and Zeynep Akata. 2020. Attribute Prototype Network for Zero-Shot Learning. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems, NeurIPS 2020."},{"key":"e_1_3_2_1_63_1","volume-title":"VGSE: Visually-Grounded Semantic Embeddings for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Xu Wenjia","year":"2022","unstructured":"Wenjia Xu, Yongqin Xian, Jiuniu Wang, Bernt Schiele, and Zeynep Akata. 2022. VGSE: Visually-Grounded Semantic Embeddings for Zero-Shot Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2022. 9306--9315."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.4"},{"key":"e_1_3_2_1_65_1","volume-title":"Designing Category-Level Attributes for Discriminative Visual Recognition. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR","author":"Yu Felix X.","year":"2013","unstructured":"Felix X. Yu, Liangliang Cao, Rog\u00e9rio Schmidt Feris, John R. Smith, and Shih-Fu Chang. 2013. Designing Category-Level Attributes for Discriminative Visual Recognition. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2013. 771--778."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611764"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612104"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00111"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00994"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680829","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680829","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:07Z","timestamp":1750295887000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680829"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":69,"alternative-id":["10.1145\/3664647.3680829","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680829","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}