{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,5]],"date-time":"2026-01-05T22:19:16Z","timestamp":1767651556010,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176139"],"award-info":[{"award-number":["62176139"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Major basic research project of Shandong Natural Science Foundation","award":["ZR2021ZD15"],"award-info":[{"award-number":["ZR2021ZD15"]}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFC3302802"],"award-info":[{"award-number":["2022YFC3302802"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"The Fundamental Research Funds of Shandong University"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612104","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"3161-3171","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["M3R: Masked Token Mixup and Cross-Modal Reconstruction for Zero-Shot Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2666-0299","authenticated-orcid":false,"given":"Peng","family":"Zhao","sequence":"first","affiliation":[{"name":"Shandong University, Jinan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0416-8778","authenticated-orcid":false,"given":"Qiangchang","family":"Wang","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8465-1294","authenticated-orcid":false,"given":"Yilong","family":"Yin","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Yashas Annadani and Soma Biswas. 2018. Preserving semantic relations for zero-shot learning. In CVPR. 7603--7612."},{"key":"e_1_3_2_1_2_1","volume-title":"Multimae: Multi-modal multi-task masked autoencoders","author":"Bachmann Roman","year":"2022","unstructured":"Roman Bachmann, David Mizrahi, Andrei Atanov, and Amir Zamir. 2022. Multimae: Multi-modal multi-task masked autoencoders. In ECCV. Springer, 348--367."},{"key":"e_1_3_2_1_3_1","unstructured":"Alexei Baevski Wei-Ning Hsu Qiantong Xu Arun Babu Jiatao Gu and Michael Auli. 2022. Data2vec: A general framework for self-supervised learning in speech vision and language. In ICML. PMLR 1298--1312."},{"key":"e_1_3_2_1_4_1","volume-title":"Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)."},{"key":"e_1_3_2_1_5_1","volume-title":"How to understand masked autoencoders. arXiv preprint arXiv:2202.03670","author":"Cao Shuhao","year":"2022","unstructured":"Shuhao Cao, Peng Xu, and David A Clifton. 2022. How to understand masked autoencoders. arXiv preprint arXiv:2202.03670 (2022)."},{"key":"e_1_3_2_1_6_1","volume-title":"Torr","author":"Chen Dubing","year":"2022","unstructured":"Dubing Chen, Yuming Shen, Haofeng Zhang, and Philip H.S. Torr. 2022d. Zero-Shot Logit Adjustment. In IJCAI. 813--819."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Jie-Neng Chen Shuyang Sun Ju He Philip HS Torr Alan Yuille and Song Bai. 2022 e. Transmix: Attend to mix for vision transformers. In CVPR. 12135--12144.","DOI":"10.1109\/CVPR52688.2022.01182"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Shiming Chen Ziming Hong Yang Liu Guo-Sen Xie Baigui Sun Hao Li Qinmu Peng Ke Lu and Xinge You. 2022b. TransZero: Attribute-guided Transformer for Zero-Shot Learning. In AAAI.","DOI":"10.1609\/aaai.v36i1.19909"},{"key":"e_1_3_2_1_9_1","volume-title":"MSDN: Mutually Semantic Distillation Network for Zero-Shot Learning. In CVPR. 7612--7621.","author":"Chen Shiming","year":"2022","unstructured":"Shiming Chen, Ziming Hong, Guo-Sen Xie, Wenhan Yang, Qinmu Peng, Kai Wang, Jian Zhao, and Xinge You. 2022c. MSDN: Mutually Semantic Distillation Network for Zero-Shot Learning. In CVPR. 7612--7621."},{"key":"e_1_3_2_1_10_1","first-page":"16622","article-title":"Hsva: Hierarchical semantic-visual adaptation for zero-shot learning","volume":"34","author":"Chen Shiming","year":"2021","unstructured":"Shiming Chen, GuoSen Xie, Yang Liu, Qinmu Peng, Baigui Sun, Hao Li, Xinge You, and Ling Shao. 2021. Hsva: Hierarchical semantic-visual adaptation for zero-shot learning. NeurIPS, Vol. 34 (2021), 16622--16634.","journal-title":"NeurIPS"},{"volume-title":"Multi-modal Masked Autoencoders for Medical Vision-and-Language Pre-training","author":"Chen Zhihong","key":"e_1_3_2_1_11_1","unstructured":"Zhihong Chen, Yuhao Du, Jinpeng Hu, Yang Liu, Guanbin Li, Xiang Wan, and Tsung-Hui Chang. 2022a. Multi-modal Masked Autoencoders for Medical Vision-and-Language Pre-training. In MICCAI. Springer, 679--689."},{"key":"e_1_3_2_1_12_1","volume-title":"Kim","author":"Choi Hyeong Kyu","year":"2022","unstructured":"Hyeong Kyu Choi, Joonmyung Choi, and Hyunwoo J. Kim. 2022. TokenMixup: Efficient Attention-guided Token-level Data Augmentation for Transformers. In NeurIPS."},{"key":"e_1_3_2_1_13_1","unstructured":"Yu-Ying Chou Hsuan-Tien Lin and Tyng-Luh Liu. 2021. Adaptive and generative zero-shot learning. In ICLR."},{"key":"e_1_3_2_1_14_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In ICLR."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Yaogong Feng Xiaowen Huang Pengbo Yang Jian Yu and Jitao Sang. 2022. Non-Generative Generalized Zero-Shot Learning via Task-Correlated Disentanglement and Controllable Samples Synthesis. In CVPR. 9346--9355.","DOI":"10.1109\/CVPR52688.2022.00913"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2408354"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Jiannan Ge Hongtao Xie Shaobo Min Pandeng Li and Yongdong Zhang. 2022. Dual Part Discovery Network for Zero-Shot Learning. In ACM MM. 3244--3252.","DOI":"10.1145\/3503161.3547889"},{"key":"e_1_3_2_1_18_1","volume-title":"Multimodal masked autoencoders learn transferable representations. arXiv preprint arXiv:2205.14204","author":"Geng Xinyang","year":"2022","unstructured":"Xinyang Geng, Hao Liu, Lisa Lee, Dale Schuurams, Sergey Levine, and Pieter Abbeel. 2022. Multimodal masked autoencoders learn transferable representations. arXiv preprint arXiv:2205.14204 (2022)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Zongyan Han Zhenyong Fu Shuo Chen and Jian Yang. 2021. Contrastive embedding for generalized zero-shot learning. In CVPR. 2371--2381.","DOI":"10.1109\/CVPR46437.2021.00240"},{"key":"e_1_3_2_1_20_1","unstructured":"Kaiming He Xinlei Chen Saining Xie Yanghao Li Piotr Doll\u00e1r and Ross Girshick. 2022a. Masked autoencoders are scalable vision learners. In CVPR. 16000--16009."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547815"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2209.07837"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Dat Huynh and Ehsan Elhamifar. 2020. Fine-grained generalized zero-shot learning via dense attribute-based attention. In CVPR. 4483--4493.","DOI":"10.1109\/CVPR42600.2020.00454"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_18"},{"key":"e_1_3_2_1_25_1","unstructured":"Jang Hyun Kim Wonho Choo Hosan Jeong and Hyun Oh Song. 2021. Co-Mixup: Saliency Guided Joint Mixup with Supermodular Diversity. In ICLR."},{"key":"e_1_3_2_1_26_1","unstructured":"Jang-Hyun Kim Wonho Choo and Hyun Oh Song. 2020. Puzzle mix: Exploiting saliency and local statistics for optimal mixup. In ICML. PMLR 5275--5285."},{"key":"e_1_3_2_1_27_1","volume-title":"Adam: A Method for Stochastic Optimization. In ICLR (Poster).","author":"Kingma Diederik P","year":"2015","unstructured":"Diederik P Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In ICLR (Poster)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Xia Kong Zuodong Gao Xiaofan Li Ming Hong Jun Liu Chengjie Wang Yuan Xie and Yanyun Qu. 2022. En-Compactness: Self-Distillation Embedding & Contrastive Generation for Generalized Zero-Shot Learning. In CVPR. 9306--9315.","DOI":"10.1109\/CVPR52688.2022.00909"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.140"},{"key":"e_1_3_2_1_30_1","unstructured":"Gang Li Heliang Zheng Daqing Liu Chaoyue Wang Bing Su and Changwen Zheng. 2022. SemMAE: Semantic-Guided Masking for Learning Masked Autoencoders. In NeuraIPS."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Jingjing Li Mengmeng Jing Ke Lu Lei Zhu Yang Yang and Zi Huang. 2019. Alleviating feature confusion for generative zero-shot learning. In ACM MM. 1587--1595.","DOI":"10.1145\/3343031.3350901"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Yan Li Junge Zhang Jianguo Zhang and Kaiqi Huang. 2018. Discriminative learning of latent features for zero-shot recognition. In CVPR. 7463--7471.","DOI":"10.1109\/CVPR.2018.00779"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Yang Liu Jishun Guo Deng Cai and Xiaofei He. 2019. Attribute attention for semantic disambiguation in zero-shot learning. In ICCV. 6698--6707.","DOI":"10.1109\/ICCV.2019.00680"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Yang Liu Lei Zhou Xiao Bai Yifei Huang Lin Gu Jun Zhou and Tatsuya Harada. 2021. Goal-oriented gaze estimation for zero-shot learning. In CVPR. 3794--3803.","DOI":"10.1109\/CVPR46437.2021.00379"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"volume-title":"Sun attribute database: Discovering, annotating, and recognizing scene attributes","author":"Patterson Genevieve","key":"e_1_3_2_1_36_1","unstructured":"Genevieve Patterson and James Hays. 2012. Sun attribute database: Discovering, annotating, and recognizing scene attributes. In CVPR. IEEE, 2751--2758."},{"key":"e_1_3_2_1_37_1","unstructured":"Viraj Uday Prabhu Sriram Yenamandra Aaditya Singh and Judy Hoffman. 2022. Adapting Self-Supervised Vision Transformers by Probing Attention-Conditioned Masking Consistency. In NeuraIPS."},{"key":"e_1_3_2_1_38_1","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1).","author":"Ridnik Tal","year":"2021","unstructured":"Tal Ridnik, Emanuel Ben-Baruch, Asaf Noy, and Lihi Zelnik-Manor. 2021. ImageNet-21K Pretraining for the Masses. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)."},{"key":"e_1_3_2_1_39_1","unstructured":"Hongzu Su Jingjing Li Zhi Chen Lei Zhu and Ke Lu. 2022. Distinguishing unseen from seen for generalized zero-shot learning. In CVPR. 7885--7894."},{"key":"e_1_3_2_1_40_1","unstructured":"Vikas Verma Alex Lamb Christopher Beckham Amir Najafi Ioannis Mitliagkas David Lopez-Paz and Yoshua Bengio. 2019. Manifold mixup: Better representations by interpolating hidden states. In ICML. PMLR 6438--6447."},{"key":"e_1_3_2_1_41_1","volume-title":"The caltech-ucsd birds-200--2011 dataset","author":"Wah Catherine","year":"2011","unstructured":"Catherine Wah, Steve Branson, Peter Welinder, Pietro Perona, and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. California Institute of Technology (2011), 1--8."},{"key":"e_1_3_2_1_42_1","volume-title":"NeurIPS","volume":"34","author":"Wang Chaoqun","year":"2021","unstructured":"Chaoqun Wang, Shaobo Min, Xuejin Chen, Xiaoyan Sun, and Houqiang Li. 2021. Dual Progressive Prototype Network for Generalized Zero-Shot Learning. NeurIPS, Vol. 34 (2021)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Yongqin Xian Tobias Lorenz Bernt Schiele and Zeynep Akata. 2018. Feature generating networks for zero-shot learning. In CVPR. 5542--5551.","DOI":"10.1109\/CVPR.2018.00581"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Yongqin Xian Saurabh Sharma Bernt Schiele and Zeynep Akata. 2019. f-vaegan-d2: A feature generating framework for any-shot learning. In CVPR. 10275--10284.","DOI":"10.1109\/CVPR.2019.01052"},{"volume-title":"Region graph embedding network for zero-shot learning","author":"Xie Guo-Sen","key":"e_1_3_2_1_45_1","unstructured":"Guo-Sen Xie, Li Liu, Fan Zhu, Fang Zhao, Zheng Zhang, Yazhou Yao, Jie Qin, and Ling Shao. 2020. Region graph embedding network for zero-shot learning. In ECCV. Springer, 562--580."},{"key":"e_1_3_2_1_46_1","volume-title":"Simmim: A simple framework for masked image modeling. In CVPR. 9653--9663.","author":"Xie Zhenda","year":"2022","unstructured":"Zhenda Xie, Zheng Zhang, Yue Cao, Yutong Lin, Jianmin Bao, Zhuliang Yao, Qi Dai, and Han Hu. 2022. Simmim: A simple framework for masked image modeling. In CVPR. 9653--9663."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3142181"},{"key":"e_1_3_2_1_48_1","first-page":"21969","article-title":"Attribute prototype network for zero-shot learning","volume":"33","author":"Xu Wenjia","year":"2020","unstructured":"Wenjia Xu, Yongqin Xian, Jiuniu Wang, Bernt Schiele, and Zeynep Akata. 2020. Attribute prototype network for zero-shot learning. NeurIPS, Vol. 33 (2020), 21969--21980.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3186512"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i9.26285"},{"key":"e_1_3_2_1_51_1","unstructured":"Zhongqi Yue Tan Wang Hanwang Zhang Qianru Sun and Xian-Sheng Hua. 2021. Counterfactual Zero-Shot and Open-Set Visual Recognition. In CVPR."},{"key":"e_1_3_2_1_52_1","volume-title":"Sanghyuk Chun, Junsuk Choe, and Youngjoon Yoo.","author":"Yun Sangdoo","year":"2019","unstructured":"Sangdoo Yun, Dongyoon Han, Seong Joon Oh, Sanghyuk Chun, Junsuk Choe, and Youngjoon Yoo. 2019. Cutmix: Regularization strategy to train strong classifiers with localizable features. In ICCV. 6023--6032."},{"key":"e_1_3_2_1_53_1","volume-title":"International Conference on Learning Representations.","author":"Zhang Hongyi","year":"2018","unstructured":"Hongyi Zhang, Moustapha Cisse, Yann N Dauphin, and David Lopez-Paz. 2018. mixup: Beyond Empirical Risk Minimization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_54_1","volume-title":"iBOT: Image BERT Pre-Training with Online Tokenizer. ICLR","author":"Zhou Jinghao","year":"2022","unstructured":"Jinghao Zhou, Chen Wei, Huiyu Wang, Wei Shen, Cihang Xie, Alan Yuille, and Tao Kong. 2022a. iBOT: Image BERT Pre-Training with Online Tokenizer. ICLR (2022)."},{"key":"e_1_3_2_1_55_1","volume-title":"Mimco: Masked image modeling pre-training with contrastive teacher. In ACM MM. 4487--4495.","author":"Zhou Qiang","year":"2022","unstructured":"Qiang Zhou, Chaohui Yu, Hao Luo, Zhibin Wang, and Hao Li. 2022b. Mimco: Masked image modeling pre-training with contrastive teacher. In ACM MM. 4487--4495."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612104","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612104","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:25Z","timestamp":1755820885000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612104"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":55,"alternative-id":["10.1145\/3581783.3612104","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612104","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}