{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,3]],"date-time":"2026-07-03T00:43:14Z","timestamp":1783039394206,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,7,10]],"date-time":"2024-07-10T00:00:00Z","timestamp":1720569600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-sa\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"Ant Group","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,7,10]]},"DOI":"10.1145\/3626772.3657833","type":"proceedings-article","created":{"date-parts":[[2024,7,11]],"date-time":"2024-07-11T12:40:05Z","timestamp":1720701605000},"page":"2156-2166","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["M\n            <sup>2<\/sup>\n            -RAAP: A Multi-Modal Recipe for Advancing Adaptation-based Pre-training towards Effective and Efficient Zero-shot Video-text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0245-9064","authenticated-orcid":false,"given":"Xingning","family":"Dong","sequence":"first","affiliation":[{"name":"Ant Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0908-9200","authenticated-orcid":false,"given":"Zipeng","family":"Feng","sequence":"additional","affiliation":[{"name":"Ant Group, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0284-6256","authenticated-orcid":false,"given":"Chunluan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Ant Group, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9752-799X","authenticated-orcid":false,"given":"Xuzheng","family":"Yu","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1691-6817","authenticated-orcid":false,"given":"Ming","family":"Yang","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, CA, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0521-9664","authenticated-orcid":false,"given":"Qingpei","family":"Guo","sequence":"additional","affiliation":[{"name":"Ant Group, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,7,11]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_4_1","volume-title":"Altclip: Altering the language encoder in clip for extended language capabilities. arXiv preprint arXiv:2211.06679","author":"Chen Zhongzhi","year":"2022","unstructured":"Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, and Ledell Wu. 2022. Altclip: Altering the language encoder in clip for extended language capabilities. arXiv preprint arXiv:2211.06679 (2022)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01034"},{"key":"e_1_3_2_1_6_1","volume-title":"Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290","author":"Cheng Xing","year":"2021","unstructured":"Xing Cheng, Hezheng Lin, Xiangyu Wu, Fan Yang, and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290 (2021)."},{"key":"e_1_3_2_1_7_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01882"},{"key":"e_1_3_2_1_9_1","volume-title":"SNP-s3: Shared network pre-training and significant semantic strengthening for various video-text tasks","author":"Dong Xingning","year":"2023","unstructured":"Xingning Dong, Qingpei Guo, Tian Gan, Qing Wang, Jianlong Wu, Xiangyuan Ren, Yuan Cheng, and Wei Chu. 2023. SNP-s3: Shared network pre-training and significant semantic strengthening for various video-text tasks. IEEE Transactions on Circuits and Systems for Video Technology (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"e_1_3_2_1_12_1","volume-title":"Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097","author":"Fang Han","year":"2021","unstructured":"Han Fang, Pengfei Xiong, Luhui Xu, and Yu Chen. 2021. Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01423"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612120"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"e_1_3_2_1_16_1","volume-title":"M2-Encoder: Advancing bilingual image-text understanding by large-scale efficient pretraining. arXiv preprint arXiv:2401.15896","author":"Guo Qingpei","year":"2024","unstructured":"Qingpei Guo, Furong Xu, Hanxiao Zhang, Wang Ren, Ziping Ma, Lin Ju, Jian Wang, Jingdong Chen, and Ming Yang. 2024. M2-Encoder: Advancing bilingual image-text understanding by large-scale efficient pretraining. arXiv preprint arXiv:2401.15896 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849","author":"Huang Zhicheng","year":"2020","unstructured":"Zhicheng Huang, Zhaoyang Zeng, Bei Liu, Dongmei Fu, and Jianlong Fu. 2020. Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849 (2020)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612006"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00300"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_21_1","volume-title":"Revealing single frame bias for video-and-language learning. arXiv preprint arXiv:2206.03428","author":"Lei Jie","year":"2022","unstructured":"Jie Lei, Tamara L Berg, and Mohit Bansal. 2022. Revealing single frame bias for video-and-language learning. arXiv preprint arXiv:2206.03428 (2022)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_23_1","volume-title":"2023 a. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023 a. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"2023 b. Unmasked teacher: Towards training-efficient video foundation models. arXiv preprint arXiv:2303.16058","author":"Li Kunchang","year":"2023","unstructured":"Kunchang Li, Yali Wang, Yizhuo Li, Yi Wang, Yinan He, Limin Wang, and Yu Qiao. 2023 b. Unmasked teacher: Towards training-efficient video foundation models. arXiv preprint arXiv:2303.16058 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Hero: Hierarchical encoder for video language omni-representation pre-training. arXiv preprint arXiv:2005.00200","author":"Li Linjie","year":"2020","unstructured":"Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, and Jingjing Liu. 2020. Hero: Hierarchical encoder for video language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_27_1","volume-title":"2023 a. Mug-STAN: Adapting image-language pretrained models for general video understanding. arXiv preprint arXiv:2311.15075","author":"Liu Ruyang","year":"2023","unstructured":"Ruyang Liu, Jingjia Huang, Wei Gao, Thomas H Li, and Ge Li. 2023 a. Mug-STAN: Adapting image-language pretrained models for general video understanding. arXiv preprint arXiv:2311.15075 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00634"},{"key":"e_1_3_2_1_29_1","volume-title":"2023 c. One for all: Video conversation is feasible without video instruction tuning. arXiv preprint arXiv:2309.15785","author":"Liu Ruyang","year":"2023","unstructured":"Ruyang Liu, Chen Li, Yixiao Ge, Ying Shan, Thomas H Li, and Ge Li. 2023 c. One for all: Video conversation is feasible without video instruction tuning. arXiv preprint arXiv:2309.15785 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in Neural Information Processing Systems , Vol. 32 (2019)."},{"key":"e_1_3_2_1_31_1","volume-title":"Univl: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353","author":"Luo Huaishao","year":"2020","unstructured":"Huaishao Luo, Lei Ji, Botian Shi, Haoyang Huang, Nan Duan, Tianrui Li, Jason Li, Taroon Bharti, and Ming Zhou. 2020. Univl: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)."},{"key":"e_1_3_2_1_32_1","volume-title":"Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860","author":"Luo Huaishao","year":"2021","unstructured":"Huaishao Luo, Lei Ji, Ming Zhong, Yang Chen, Wen Lei, Nan Duan, and Tianrui Li. 2021. Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860 (2021)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548180"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-68790-8_27"},{"key":"e_1_3_2_1_36_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"e_1_3_2_1_38_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_1_39_1","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems , Vol. 35 (2022), 25278--25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01265"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11036-020-01730-0"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597434"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612152"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"e_1_3_2_1_47_1","volume-title":"2023 a. Internvid: A large-scale video-text dataset for multimodal understanding and generation. arXiv preprint arXiv:2307.06942","author":"Wang Yi","year":"2023","unstructured":"Yi Wang, Yinan He, Yizhuo Li, Kunchang Li, Jiashuo Yu, Xin Ma, Xinhao Li, Guo Chen, Xinyuan Chen, Yaohui Wang, et al. 2023 a. Internvid: A large-scale video-text dataset for multimodal understanding and generation. arXiv preprint arXiv:2307.06942 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"VLM: Task-agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996","author":"Xu Hu","year":"2021","unstructured":"Hu Xu, Gargi Ghosh, Po-Yao Huang, Prahal Arora, Masoumeh Aminzadeh, Christoph Feichtenhofer, Florian Metze, and Luke Zettlemoyer. 2021. VLM: Task-agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996 (2021)."},{"key":"e_1_3_2_1_49_1","volume-title":"2023 a. Youku-mPLUG: A 10 million large-scale chinese video-language dataset for pre-training and benchmarks. arXiv preprint arXiv:2306.04362","author":"Xu Haiyang","year":"2023","unstructured":"Haiyang Xu, Qinghao Ye, Xuan Wu, Ming Yan, Yuan Miao, Jiabo Ye, Guohai Xu, Anwen Hu, Yaya Shi, Guangwei Xu, et al. 2023 a. Youku-mPLUG: A 10 million large-scale chinese video-language dataset for pre-training and benchmarks. arXiv preprint arXiv:2306.04362 (2023)."},{"key":"e_1_3_2_1_50_1","volume-title":"2023 b. mplug-2: A modularized multi-modal foundation model across text, image and video. arXiv preprint arXiv:2302.00402","author":"Xu Haiyang","year":"2023","unstructured":"Haiyang Xu, Qinghao Ye, Ming Yan, Yaya Shi, Jiabo Ye, Yuanhong Xu, Chenliang Li, Bin Bi, Qi Qian, Wei Wang, et al. 2023 b. mplug-2: A modularized multi-modal foundation model across text, image and video. arXiv preprint arXiv:2302.00402 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_52_1","volume-title":"Clip-vip: Adapting pre-trained image-text model to video-language representation alignment. arXiv preprint arXiv:2209.06430","author":"Xue Hongwei","year":"2022","unstructured":"Hongwei Xue, Yuchong Sun, Bei Liu, Jianlong Fu, Ruihua Song, Houqiang Li, and Jiebo Luo. 2022. Clip-vip: Adapting pre-trained image-text model to video-language representation alignment. arXiv preprint arXiv:2209.06430 (2022)."},{"key":"e_1_3_2_1_53_1","volume-title":"VideoCoCa: Video-text modeling with zero-shot transfer from contrastive captioners. arXiv preprint arXiv:2212.04979","author":"Yan Shen","year":"2022","unstructured":"Shen Yan, Tao Zhu, Zirui Wang, Yuan Cao, Mi Zhang, Soham Ghosh, Yonghui Wu, and Jiahui Yu. 2022. VideoCoCa: Video-text modeling with zero-shot transfer from contrastive captioners. arXiv preprint arXiv:2212.04979 (2022)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00171"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01136"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01413"},{"key":"e_1_3_2_1_57_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_58_1","unstructured":"Aohan Zeng Xiao Liu Zhengxiao Du Zihan Wang Hanyu Lai Ming Ding Zhuoyi Yang Yifan Xu Wendi Zheng Xiao Xia et al. 2022. Glm-130b: An open bilingual pre-trained model. arXiv preprint arXiv:2210.02414 (2022)."},{"key":"e_1_3_2_1_59_1","unstructured":"Bin Zhu Bin Lin Munan Ning Yang Yan Jiaxi Cui HongFa Wang Yatian Pang Wenhao Jiang Junwu Zhang Zongwei Li et al. 2023. LanguageBind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852 (2023)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"}],"event":{"name":"SIGIR 2024: The 47th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Washington DC USA","acronym":"SIGIR 2024","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626772.3657833","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3626772.3657833","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:40:35Z","timestamp":1755841235000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626772.3657833"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,10]]},"references-count":60,"alternative-id":["10.1145\/3626772.3657833","10.1145\/3626772"],"URL":"https:\/\/doi.org\/10.1145\/3626772.3657833","relation":{},"subject":[],"published":{"date-parts":[[2024,7,10]]},"assertion":[{"value":"2024-07-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}