{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,21]],"date-time":"2026-06-21T10:48:31Z","timestamp":1782038911966,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611756","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"3847-3856","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Mask to Reconstruct: Cooperative Semantics Completion for Video-text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4379-2971","authenticated-orcid":false,"given":"Han","family":"Fang","sequence":"first","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0054-8896","authenticated-orcid":false,"given":"Zhifei","family":"Yang","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8421-7167","authenticated-orcid":false,"given":"Xianghao","family":"Zang","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8114-103X","authenticated-orcid":false,"given":"Chao","family":"Ban","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1835-9271","authenticated-orcid":false,"given":"Zhongjiang","family":"He","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7917-1628","authenticated-orcid":false,"given":"Hao","family":"Sun","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7003-287X","authenticated-orcid":false,"given":"Lanxiang","family":"Zhou","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_2_1_3_1","volume-title":"Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In IEEE\/CVF International Conference on Computer Vision (ICCV). 1708--1718","author":"Bain Max","year":"2021","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In IEEE\/CVF International Conference on Computer Vision (ICCV). 1708--1718."},{"key":"e_1_3_2_1_4_1","volume-title":"Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_1_8_1","volume-title":"Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290","author":"Cheng Xing","year":"2021","unstructured":"Xing Cheng, Hezheng Lin, Xiangyu Wu, Fan Yang, and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290 (2021)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01138"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the International Conference on Learning Representations. 1--22","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, and Sylvain Gelly. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In Proceedings of the International Conference on Learning Representations. 1--22."},{"key":"e_1_3_2_1_11_1","volume-title":"MDMMT: Multidomain Multimodal Transformer for Video Retrieval. In IEEE Conference on Computer Vision and Pattern Recognition Workshops. 3354--3363","author":"Dzabraev Maksim","year":"2021","unstructured":"Maksim Dzabraev, Maksim Kalashnikov, Stepan Komkov, and Aleksandr Petiushko. 2021. MDMMT: Multidomain Multimodal Transformer for Video Retrieval. In IEEE Conference on Computer Vision and Pattern Recognition Workshops. 3354--3363."},{"key":"e_1_3_2_1_12_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3227416"},{"key":"e_1_3_2_1_14_1","unstructured":"Christoph Feichtenhofer Yanghao Li Kaiming He et al. 2022. Masked autoencoders as spatiotemporal learners. Advances in neural information processing systems Vol. 35 (2022) 35946--35958."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_16_1","volume-title":"International conference on machine learning. PMLR, 1180--1189","author":"Ganin Yaroslav","year":"2015","unstructured":"Yaroslav Ganin and Victor Lempitsky. 2015. Unsupervised domain adaptation by backpropagation. In International conference on machine learning. PMLR, 1180--1189."},{"key":"e_1_3_2_1_17_1","volume-title":"Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544","author":"Gao Peng","year":"2021","unstructured":"Peng Gao, Shijie Geng, Renrui Zhang, Teli Ma, Rongyao Fang, Yongfeng Zhang, Hongsheng Li, and Yu Qiao. 2021. Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544 (2021)."},{"key":"e_1_3_2_1_18_1","volume-title":"Tel Aviv","author":"Ge Yuying","year":"2022","unstructured":"Yuying Ge, Yixiao Ge, Xihui Liu, Jinpeng Wang, Jianping Wu, Ying Shan, Xiaohu Qie, and Ping Luo. 2022. Miles: visual bert pre-training with injected language semantics for video-text retrieval. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXV. Springer, 691--708."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_21_1","volume-title":"Milan: Masked image pretraining on language assisted representation. arXiv preprint arXiv:2208.06049","author":"Hou Zejiang","year":"2022","unstructured":"Zejiang Hou, Fei Sun, Yen-Kuang Chen, Yuan Xie, and Sun-Yuan Kung. 2022. Milan: Masked image pretraining on language assisted representation. arXiv preprint arXiv:2208.06049 (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_23_1","volume-title":"Tel Aviv","author":"Kakogeorgiou Ioannis","year":"2022","unstructured":"Ioannis Kakogeorgiou, Spyros Gidaris, Bill Psomas, Yannis Avrithis, Andrei Bursuc, Konstantinos Karantzalos, and Nikos Komodakis. 2022. What to hide from your students: Attention-guided masked image modeling. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXX. Springer, 300--318."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_26_1","volume-title":"Revisiting Temporal Modeling for CLIP-based Image-to-Video Knowledge Transferring. arXiv preprint arXiv:2301.11116","author":"Liu Ruyang","year":"2023","unstructured":"Ruyang Liu, Jingjia Huang, Ge Li, Jiashi Feng, Xinglong Wu, and Thomas H Li. 2023. Revisiting Temporal Modeling for CLIP-based Image-to-Video Knowledge Transferring. arXiv preprint arXiv:2301.11116 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"e_1_3_2_1_28_1","volume-title":"Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487","author":"Liu Yang","year":"2019","unstructured":"Yang Liu, Samuel Albanie, Arsha Nagrani, and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"e_1_3_2_1_30_1","volume-title":"Univilm: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353","author":"Luo Huaishao","year":"2020","unstructured":"Huaishao Luo, Lei Ji, Botian Shi, Haoyang Huang, Nan Duan, Tianrui Li, Xilin Chen, and Ming Zhou. 2020. Univilm: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_33_1","volume-title":"SimVTP: Simple Video Text Pre-training with Masked Autoencoders. arXiv preprint arXiv:2212.03490","author":"Ma Yue","year":"2022","unstructured":"Yue Ma, Tianyu Yang, Yin Shan, and Xiu Li. 2022b. SimVTP: Simple Video Text Pre-training with Masked Autoencoders. arXiv preprint arXiv:2212.03490 (2022)."},{"key":"e_1_3_2_1_34_1","volume-title":"Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516","author":"Miech Antoine","year":"2018","unstructured":"Antoine Miech, Ivan Laptev, and Josef Sivic. 2018. Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516 (2018)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, and Jack Clark. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"e_1_3_2_1_39_1","volume-title":"Masked Contrastive Pre-Training for Efficient Video-Text Retrieval. arXiv preprint arXiv:2212.00986","author":"Shu Fangxun","year":"2022","unstructured":"Fangxun Shu, Biaolong Chen, Yue Liao, Shuwen Xiao, Wenyu Sun, Xiaobo Li, Yousong Zhu, Jinqiao Wang, and Si Liu. 2022. Masked Contrastive Pre-Training for Efficient Video-Text Retrieval. arXiv preprint arXiv:2212.00986 (2022)."},{"key":"e_1_3_2_1_40_1","volume-title":"Vimpac: Video pre-training via masked token prediction and contrastive learning. arXiv preprint arXiv:2106.11250","author":"Tan Hao","year":"2021","unstructured":"Hao Tan, Jie Lei, Thomas Wolf, and Mohit Bansal. 2021. Vimpac: Video pre-training via masked token prediction and contrastive learning. arXiv preprint arXiv:2106.11250 (2021)."},{"key":"e_1_3_2_1_41_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. arXiv preprint arXiv:2203.12602","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. arXiv preprint arXiv:2203.12602 (2022)."},{"key":"e_1_3_2_1_42_1","volume-title":"VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking. arXiv preprint arXiv:2303.16727","author":"Wang Limin","year":"2023","unstructured":"Limin Wang, Bingkun Huang, Zhiyu Zhao, Zhan Tong, Yinan He, Yi Wang, Yali Wang, and Yu Qiao. 2023. VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking. arXiv preprint arXiv:2303.16727 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Disentangled representation learning for text-video retrieval. arXiv preprint arXiv:2203.07111","author":"Wang Qiang","year":"2022","unstructured":"Qiang Wang, Yanhao Zhang, Yun Zheng, Pan Pan, and Xian-Sheng Hua. 2022b. Disentangled representation learning for text-video retrieval. arXiv preprint arXiv:2203.07111 (2022)."},{"key":"e_1_3_2_1_44_1","volume-title":"Masked Video Distillation: Rethinking Masked Feature Modeling for Self-supervised Video Representation Learning. arXiv preprint arXiv:2212.04500","author":"Wang Rui","year":"2022","unstructured":"Rui Wang, Dongdong Chen, Zuxuan Wu, Yinpeng Chen, Xiyang Dai, Mengchen Liu, Lu Yuan, and Yu-Gang Jiang. 2022a. Masked Video Distillation: Rethinking Masked Feature Modeling for Self-supervised Video Representation Learning. arXiv preprint arXiv:2212.04500 (2022)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"e_1_3_2_1_46_1","volume-title":"Tel Aviv","author":"Wei Longhui","year":"2022","unstructured":"Longhui Wei, Lingxi Xie, Wengang Zhou, Houqiang Li, and Qi Tian. 2022. Mvp: Multimodality-guided visual pre-training. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXX. Springer, 337--353."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475515"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_49_1","first-page":"4514","article-title":"Probing inter-modality: Visual parsing with self-attention for vision-and-language pre-training","volume":"34","author":"Xue Hongwei","year":"2021","unstructured":"Hongwei Xue, Yupan Huang, Bei Liu, Houwen Peng, Jianlong Fu, Houqiang Li, and Jiebo Luo. 2021. Probing inter-modality: Visual parsing with self-attention for vision-and-language pre-training. Advances in Neural Information Processing Systems, Vol. 34 (2021), 4514--4528.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_50_1","volume-title":"CLIP-ViP: Adapting Pre-trained Image-Text Model to Video-Language Representation Alignment. arXiv preprint arXiv:2209.06430","author":"Xue Hongwei","year":"2022","unstructured":"Hongwei Xue, Yuchong Sun, Bei Liu, Jianlong Fu, Ruihua Song, Houqiang Li, and Jiebo Luo. 2022. CLIP-ViP: Adapting Pre-trained Image-Text Model to Video-Language Representation Alignment. arXiv preprint arXiv:2209.06430 (2022)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"e_1_3_2_1_53_1","volume-title":"CenterCLIP: Token Clustering for Efficient Text-Video Retrieval. arXiv preprint arXiv:2205.00823","author":"Zhao Shuai","year":"2022","unstructured":"Shuai Zhao, Linchao Zhu, Xiaohan Wang, and Yi Yang. 2022. CenterCLIP: Token Clustering for Efficient Text-Video Retrieval. arXiv preprint arXiv:2205.00823 (2022)."},{"key":"e_1_3_2_1_54_1","volume-title":"PointCLIP V2: Adapting CLIP for Powerful 3D Open-world Learning. arXiv preprint arXiv:2211.11682","author":"Zhu Xiangyang","year":"2022","unstructured":"Xiangyang Zhu, Renrui Zhang, Bowei He, Ziyao Zeng, Shanghang Zhang, and Peng Gao. 2022. PointCLIP V2: Adapting CLIP for Powerful 3D Open-world Learning. arXiv preprint arXiv:2211.11682 (2022)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611756","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611756","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:11:44Z","timestamp":1755821504000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611756"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":54,"alternative-id":["10.1145\/3581783.3611756","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611756","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}