{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:04:21Z","timestamp":1765357461837,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100017052","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006140"],"award-info":[{"award-number":["62006140"]}],"id":[{"id":"10.13039\/100017052","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shandong Provincial Natural Science Foundation","award":["ZR2020QF106"],"award-info":[{"award-number":["ZR2020QF106"]}]},{"name":"Shenzhen Higher Education Institutions Stable Support Program (Key Project)","award":["GXWD20220817123150002"],"award-info":[{"award-number":["GXWD20220817123150002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612129","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"2221-2232","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Mask Again: Masked Knowledge Distillation for Masked Video Modeling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6449-2727","authenticated-orcid":false,"given":"Xiaojie","family":"Li","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen & Peng Cheng Laboratory, Shenzhen, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7214-0921","authenticated-orcid":false,"given":"Shaowei","family":"He","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0247-5221","authenticated-orcid":false,"given":"Jianlong","family":"Wu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9865-2212","authenticated-orcid":false,"given":"Yue","family":"Yu","sequence":"additional","affiliation":[{"name":"Peng Cheng Laboratory, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1476-0273","authenticated-orcid":false,"given":"Liqiang","family":"Nie","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3895-5510","authenticated-orcid":false,"given":"Min","family":"Zhang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02323"},{"key":"e_1_3_2_1_3_1","volume-title":"Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is Space-Time Attention All You Need for Video Understanding?. 
In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_5_1","volume-title":"Swathikiran Sudhakaran, Brais Martinez, and Georgios Tzimiropoulos.","author":"Bulat Adrian","year":"2021","unstructured":"Adrian Bulat, Juan Manuel Perez Rua, Swathikiran Sudhakaran, Brais Martinez, and Georgios Tzimiropoulos. 2021. Space-time mixing attention for video transformer. Advances in neural information processing systems, Vol. 34 (2021), 19594--19607."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the International Conference on Machine Learning. PMLR, 1691--1703","author":"Chen Mark","year":"2020","unstructured":"Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, and Ilya Sutskever. 2020. Generative pretraining from pixels. In Proceedings of the International Conference on Machine Learning. PMLR, 1691--1703."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00153"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547783"},{"key":"e_1_3_2_1_10_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"e_1_3_2_1_13_1","volume-title":"Masked Autoencoders As Spatiotemporal Learners. Advances in Neural Information Processing Systems","author":"Feichtenhofer Christoph","year":"2022","unstructured":"Christoph Feichtenhofer, Haoqi Fan, Yanghao Li, and Kaiming He. 2022. Masked Autoencoders As Spatiotemporal Learners. 
Advances in Neural Information Processing Systems (2022)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"e_1_3_2_1_17_1","volume-title":"Armand Joulin, and Ishan Misra.","author":"Girdhar Rohit","year":"2022","unstructured":"Rohit Girdhar, Alaaeldin El-Nouby, Mannat Singh, Kalyan Vasudev Alwala, Armand Joulin, and Ishan Misra. 2022. OmniMAE: Single Model Masked Pretraining on Images and Videos. arXiv (2022)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_19"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"He Kaiming","year":"2021","unstructured":"Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Doll\u00e1r, and Ross Girshick. 2021. Masked Autoencoders Are Scalable Vision Learners. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2021), 15979--15988."},{"key":"e_1_3_2_1_22_1","unstructured":"Geoffrey Hinton Oriol Vinyals Jeff Dean et al. 2015. Distilling the knowledge in a neural network. arXiv (2015)."},{"key":"e_1_3_2_1_23_1","volume-title":"Improving neural networks by preventing co-adaptation of feature detectors. arXiv","author":"Hinton Geoffrey E","year":"2012","unstructured":"Geoffrey E Hinton, Nitish Srivastava, Alex Krizhevsky, Ilya Sutskever, and Ruslan R Salakhutdinov. 2012. Improving neural networks by preventing co-adaptation of feature detectors. 
arXiv (2012)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00784"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_39"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01535"},{"key":"e_1_3_2_1_27_1","volume-title":"TinyBERT: Distilling BERT for Natural Language Understanding. In Findings of the Conference on Empirical Methods in Natural Language Processing. 4163--4174","author":"Jiao Xiaoqi","year":"2020","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, and Qun Liu. 2020. TinyBERT: Distilling BERT for Natural Language Understanding. In Findings of the Conference on Empirical Methods in Natural Language Processing. 4163--4174."},{"key":"e_1_3_2_1_28_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_1_30_1","volume-title":"UniFormer: Unified Transformer for Efficient Spatial-Temporal Representation Learning. In International Conference on Learning Representations. OpenReview.net.","author":"Li Kunchang","year":"2022","unstructured":"Kunchang Li, Yali Wang, Gao Peng, Guanglu Song, Yu Liu, Hongsheng Li, and Yu Qiao. 2022. UniFormer: Unified Transformer for Efficient Spatial-Temporal Representation Learning. In International Conference on Learning Representations. 
OpenReview.net."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_2"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00726"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01345"},{"key":"e_1_3_2_1_37_1","volume-title":"Sgdr: Stochastic gradient descent with warm restarts. arXiv","author":"Loshchilov Ilya","year":"2016","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv (2016)."},{"key":"e_1_3_2_1_38_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. Fixing weight decay regularization in adam. (2018)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548180"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00409"},{"key":"e_1_3_2_1_43_1","volume-title":"Christoph Feichtenhofer, Andrea Vedaldi, and Joao F. Henriques.","author":"Patrick Mandela","year":"2021","unstructured":"Mandela Patrick, Dylan Campbell, Yuki M. Asano, Ishan Misra Florian Metze, Christoph Feichtenhofer, Andrea Vedaldi, and Joao F. Henriques. 2021. Keeping Your Eye on the Ball: Trajectory Attention in Video Transformers. In Advances in Neural Information Processing Systems. 12493--12506."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Baoyun Peng Xiao Jin Jiaheng Liu Shunfeng Zhou Yichao Wu Yu Liu Dongsheng Li and Zhaoning Zhang. 2019. 
Correlation Congruence for Knowledge Distillation. (2019) 5006--5015.","DOI":"10.1109\/ICCV.2019.00511"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01233"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_48_1","volume-title":"Parameter-efficient and student-friendly knowledge distillation. arXiv","author":"Rao Jun","year":"2022","unstructured":"Jun Rao, Xv Meng, Liang Ding, Shuhan Qi, and Dacheng Tao. 2022. Parameter-efficient and student-friendly knowledge distillation. arXiv (2022)."},{"key":"e_1_3_2_1_49_1","volume-title":"FitNets: Hints for Thin Deep Nets. In International Conference on Learning Representations.","author":"Romero Adriana","year":"2015","unstructured":"Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. 2015. FitNets: Hints for Thin Deep Nets. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01025"},{"key":"e_1_3_2_1_51_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. 
arXiv (2012)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.195"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_54_1","volume-title":"VIMPAC: Video Pre-Training via Masked Token Prediction and Contrastive Learning. arXiv","author":"Tan Hao","year":"2021","unstructured":"Hao Tan, Jie Lei, Thomas Wolf, and Mohit Bansal. 2021. VIMPAC: Video Pre-Training via Masked Token Prediction and Contrastive Learning. arXiv (2021)."},{"key":"e_1_3_2_1_55_1","volume-title":"Contrastive Representation Distillation. In International Conference on Learning Representations. OpenReview.net.","author":"Tian Yonglong","year":"2020","unstructured":"Yonglong Tian, Dilip Krishnan, and Phillip Isola. 2020. Contrastive Representation Distillation. In International Conference on Learning Representations. OpenReview.net."},{"key":"e_1_3_2_1_56_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems, Vol. 
35 (2022), 10078--10093."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_19"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01432"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00611"},{"key":"e_1_3_2_1_61_1","first-page":"5776","article-title":"Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","volume":"33","author":"Wang Wenhui","year":"2020","unstructured":"Wenhui Wang, Furu Wei, Li Dong, Hangbo Bao, Nan Yang, and Ming Zhou. 2020. Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers. Advances in Neural Information Processing Systems, Vol. 33 (2020), 5776--5788.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3265261"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"e_1_3_2_1_65_1","volume-title":"Stare at What You See: Masked Image Modeling without Reconstruction. arXiv","author":"Xue Hongwei","year":"2022","unstructured":"Hongwei Xue, Peng Gao, Hongyang Li, Yu Qiao, Hao Sun, Houqiang Li, and Jiebo Luo. 2022. Stare at What You See: Masked Image Modeling without Reconstruction. arXiv (2022)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00612"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01389"},{"key":"e_1_3_2_1_68_1","volume-title":"Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. 
arXiv","author":"Zagoruyko Sergey","year":"2016","unstructured":"Sergey Zagoruyko and Nikos Komodakis. 2016. Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv (2016)."},{"key":"e_1_3_2_1_69_1","volume-title":"International Conference on Learning Representations. OpenReview.net.","author":"Zhang Hongyi","year":"2018","unstructured":"Hongyi Zhang, Moustapha Cisse, Yann N Dauphin, and David Lopez-Paz. 2018. mixup: Beyond empirical risk minimization. In International Conference on Learning Representations. OpenReview.net."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612129","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612129","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:02:50Z","timestamp":1755820970000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612129"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":69,"alternative-id":["10.1145\/3581783.3612129","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612129","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}