{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T02:15:22Z","timestamp":1768097722579,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"state key development program in 14th Five-Year","award":["2021YFF0900701,2021YFF0602103, 2021YFF0602102, 2021QY1702"],"award-info":[{"award-number":["2021YFF0900701,2021YFF0602103, 2021YFF0602102, 2021QY1702"]}]},{"name":"Natural Science Foundation of China","award":["No.61801441"],"award-info":[{"award-number":["No.61801441"]}]},{"name":"the Institute for Guo Qiang, Tsinghua University","award":["No. 2019GQG0001"],"award-info":[{"award-number":["No. 2019GQG0001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680778","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"4871-4880","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Adaptively Building a Video-language Model for Video Captioning and Retrieval without Massive Video Pretraining"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0917-001X","authenticated-orcid":false,"given":"Zihao","family":"Liu","sequence":"first","affiliation":[{"name":"State Key Laboratory of Media Convergence and Communication, Communication University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3481-7820","authenticated-orcid":false,"given":"Xiaoyu","family":"Wu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Media Convergence and Communication, Communication University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7809-1932","authenticated-orcid":false,"given":"Shengjin","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Electronic Engineering, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1854-0104","authenticated-orcid":false,"given":"Jiayao","family":"Qian","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Media Convergence and Communication, Communication University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob L. Menick, Sebastian Borgeaud, Andy Brock, Aida Nematzadeh, Sahand Sharifzadeh, Mikoaj Bikowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Kar\u00e9n Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. In Advances in Neural Information Processing Systems, Vol. 35. Curran Associates, Inc., New Orleans, USA, 23716--23736."},{"key":"e_1_3_2_1_2_1","volume-title":"ViViT: A Video Vision Transformer. In 2021 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE","author":"Arnab Anurag","year":"2021","unstructured":"Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lucic, and Cordelia Schmid. 2021. ViViT: A Video Vision Transformer. In 2021 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE, Montreal, Canada, 6816--6826."},{"key":"e_1_3_2_1_3_1","volume-title":"BEiT: BERT Pre-Training of Image Transformers. In The Tenth International Conference on Learning Representations(ICLR). OpenReview.net, Virtual Event.","author":"Bao Hangbo","year":"2022","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2022. BEiT: BERT Pre-Training of Image Transformers. In The Tenth International Conference on Learning Representations(ICLR). OpenReview.net, Virtual Event."},{"key":"e_1_3_2_1_4_1","volume-title":"Kriti Aggarwal, Subhojit Som, Songhao Piao, and Furu Wei.","author":"Bao Hangbo","year":"2022","unstructured":"Hangbo Bao, Wenhui Wang, Li Dong, Qiang Liu, Owais Khan Mohammed, Kriti Aggarwal, Subhojit Som, Songhao Piao, and Furu Wei. 2022. VLMo: Unified Vision-Language Pre-Training with Mixture-of-Modality-Experts. In Advances in Neural Information Processing Systems(NIPS), Vol. 35. Curran Associates, Inc., New Orleans, USA, 32897--32912."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24","volume":"824","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is Space-Time Attention All You Need for Video Understanding?. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24 July 2021 (Proceedings of Machine Learning Research, Vol. 139). PMLR, Virtual Event, 813--824."},{"key":"e_1_3_2_1_6_1","volume-title":"Annual Meeting of the Association for Computational Linguistics(ACL). ACL","author":"Chen David","year":"2011","unstructured":"David Chen and William B Dolan. 2011. Collecting highly parallel data for paraphrase evaluation. In Annual Meeting of the Association for Computational Linguistics(ACL). ACL, Portland, USA, 190--200."},{"key":"e_1_3_2_1_7_1","volume-title":"VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset. arXiv preprint","author":"Chen Sihan","year":"2023","unstructured":"Sihan Chen, Xingjian He, Longteng Guo, Xinxin Zhu, Weining Wang, Jinhui Tang, and Jing Liu. 2023. VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset. arXiv preprint, Vol. arXiv:2304.08345 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"VAST: A Vision-Audio-Subtitle-Text Omni-Modality Foundation Model and Dataset. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems","author":"Chen Sihan","year":"2023","unstructured":"Sihan Chen, Handong Li, Qunbo Wang, Zijia Zhao, Mingzhen Sun, Xinxin Zhu, and Jing Liu. 2023. VAST: A Vision-Audio-Subtitle-Text Omni-Modality Foundation Model and Dataset. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS. New Orleans, LA, USA."},{"key":"e_1_3_2_1_9_1","volume-title":"2023 d. VideoOFA: Two-Stage Pre-Training for Video-to-Text Generation. arXiv preprint","author":"Chen Xilun","year":"2023","unstructured":"Xilun Chen, L. Yu, Wenhan Xiong, Barlas Ouguz, Yashar Mehdad, and Wen tau Yih. 2023 d. VideoOFA: Two-Stage Pre-Training for Video-to-Text Generation. arXiv preprint, Vol. arXiv:2305.03204 (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25113"},{"key":"e_1_3_2_1_11_1","volume-title":"Improving Video-Text Retrieval by Multi-Stream Corpus Alignment and Dual Softmax Loss. arXiv preprint","author":"Cheng Xing","year":"2021","unstructured":"Xing Cheng, Hezheng Lin, Xiangyu Wu, Fan Yang, and Dong Shen. 2021. Improving Video-Text Retrieval by Multi-Stream Corpus Alignment and Dual Softmax Loss. arXiv preprint, Vol. arXiv:2109.04290 (2021)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the Ninth Workshop on Statistical Machine Translation. ACL","author":"Michael","unstructured":"Michael J. Denkowski and Alon Lavie. 2014. Meteor Universal: Language Specific Translation Evaluation for Any Target Language. In Proceedings of the Ninth Workshop on Statistical Machine Translation. ACL, Baltimore, USA, 376--380."},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Vol. 1. Association for Computational Linguistics, Minneapolis, USA, 4171--4186."},{"key":"e_1_3_2_1_14_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021, May 3--7, 2021. OpenReview.net, Virtual Event."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1561\/0600000105"},{"key":"e_1_3_2_1_16_1","volume-title":"22nd Annual Conference of the International Speech Communication Association","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong, Yu-An Chung, and James R. Glass. 2021. AST: Audio Spectrogram Transformer. In Interspeech 2021, 22nd Annual Conference of the International Speech Communication Association 2021. ISCA, Brno, Czechia, 571--575."},{"key":"e_1_3_2_1_17_1","volume-title":"Masked Autoencoders Are Scalable Vision Learners. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE\/CVF","author":"He Kaiming","unstructured":"Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Doll'ar, and Ross B. Girshick. 2022. Masked Autoencoders Are Scalable Vision Learners. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE\/CVF, New Orleans, LA, USA, 15979--15988."},{"key":"e_1_3_2_1_18_1","volume-title":"Localizing Moments in Video with Natural Language. In IEEE International Conference on Computer Vision, ICCV","author":"Hendricks Lisa Anne","year":"2017","unstructured":"Lisa Anne Hendricks, Oliver Wang, Eli Shechtman, Josef Sivic, Trevor Darrell, and Bryan C. Russell. 2017. Localizing Moments in Video with Natural Language. In IEEE International Conference on Computer Vision, ICCV 2017. IEEE Computer Society, Venice, Italy, 5804--5813."},{"key":"e_1_3_2_1_19_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In The Tenth International Conference on Learning Representations, ICLR","author":"Hu Edward J.","year":"2022","unstructured":"Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In The Tenth International Conference on Learning Representations, ICLR 2022. OpenReview.net, Virtual Event."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning(ICML)","volume":"139","author":"Jaegle Andrew","year":"2021","unstructured":"Andrew Jaegle, Felix Gimeno, Andy Brock, Oriol Vinyals, Andrew Zisserman, and Jo\u00e3o Carreira. 2021. Perceiver: General Perception with Iterative Attention. In Proceedings of the 38th International Conference on Machine Learning(ICML), Vol. 139. PMLR, Virtual Event, 4651--4664."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021 (Proceedings of Machine Learning Research","volume":"4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021 (Proceedings of Machine Learning Research, Vol. 139). PMLR, Virtual Event, 4904--4916."},{"key":"e_1_3_2_1_22_1","volume-title":"Tencent Text-Video Retrieval: Hierarchical Cross-Modal Interactions with Multi-Level Representations","author":"Jiang Jie","year":"2022","unstructured":"Jie Jiang, Shaobo Min, Weijie Kong, Hongfa Wang, Zhifeng Li, and Wei Liu. 2022. Tencent Text-Video Retrieval: Hierarchical Cross-Modal Interactions with Multi-Level Representations. IEEE Access (2022), 1--1."},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Machine Learning, ICML 2023 (Proceedings of Machine Learning Research","volume":"19742","author":"Li Junnan","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning, ICML 2023 (Proceedings of Machine Learning Research, Vol. 202). PMLR, Honolulu, Hawaii, USA, 19730--19742."},{"key":"e_1_3_2_1_24_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In International Conference on Machine Learning(ICML). PMLR","author":"Li Junnan","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C. H. Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In International Conference on Machine Learning(ICML). PMLR, Baltimore, Maryland, USA, 12888--12900."},{"key":"e_1_3_2_1_25_1","volume-title":"Advances in Neural Information Processing Systems(NIPS)","volume":"34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. In Advances in Neural Information Processing Systems(NIPS), Vol. 34. Curran Associates, Inc., Virtual Event, 9694--9705."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"e_1_3_2_1_27_1","volume-title":"Hero: Hierarchical Encoder for VideoLanguage Omni-representation Pre-training. In Conference on Empirical Methods in Natural Language Processing(EMNLP). ACL, Virtual Events","author":"Li Linjie","year":"2020","unstructured":"Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, and Jingjing Liu. 2020. Hero: Hierarchical Encoder for VideoLanguage Omni-representation Pre-training. In Conference on Empirical Methods in Natural Language Processing(EMNLP). ACL, Virtual Events, 2046--2065."},{"key":"e_1_3_2_1_28_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Annual Meeting of the Association for Computational Linguistics(ACL). ACL","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Annual Meeting of the Association for Computational Linguistics(ACL). ACL, Barcelona, Spain, 74--81."},{"key":"e_1_3_2_1_29_1","volume-title":"SwinBERT: End-to-End Transformers with Sparse Attention for Video Captioning. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE\/CVF","author":"Lin Kevin","year":"2022","unstructured":"Kevin Lin, Linjie Li, Chung-Ching Lin, Faisal Ahmed, Zhe Gan, Zicheng Liu, Yumao Lu, and Lijuan Wang. 2022. SwinBERT: End-to-End Transformers with Sparse Attention for Video Captioning. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE\/CVF, New Orleans, LA, USA, 17928--17937."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00693"},{"key":"e_1_3_2_1_31_1","volume-title":"Towards Fast Adaptation of Pretrained Contrastive Models for Multi-channel Video-Language Retrieval. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE","author":"Lin Xudong","year":"2023","unstructured":"Xudong Lin, Simran Tiwari, Shiyuan Huang, Manling Li, Mike Zheng Shou, Heng Ji, and Shih-Fu Chang. 2023. Towards Fast Adaptation of Pretrained Contrastive Models for Multi-channel Video-Language Retrieval. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE, Vancouver, BC, Canada, 14846--14855."},{"key":"e_1_3_2_1_32_1","volume-title":"Revisiting Temporal Modeling for CLIP-Based Image-to-Video Knowledge Transferring. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR","author":"Liu Ruyang","year":"2023","unstructured":"Ruyang Liu, Jingjia Huang, Ge Li, Jiashi Feng, Xinglong Wu, and Thomas H. Li. 2023. Revisiting Temporal Modeling for CLIP-Based Image-to-Video Knowledge Transferring. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023. IEEE, Vancouver, BC, Canada, 6555--6564."},{"key":"e_1_3_2_1_33_1","volume-title":"Electronics","volume":"11","author":"Liu Zihao","year":"2022","unstructured":"Zihao Liu, Xiaoyu Wu, and Ying Yu. 2022. Multi-Task Video Captioning with a Stepwise Multimodal Encoder. Electronics, Vol. 11, 17 (2022)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_35_1","volume-title":"X-CLIP: End-to-End Multi-grained Contrastive Learning for Video-Text Retrieval. In MM '22: The 30th ACM International Conference on Multimedia. ACM","author":"Ma Yiwei","year":"2022","unstructured":"Yiwei Ma, Guohai Xu, Xiaoshuai Sun, Ming Yan, Ji Zhang, and Rongrong Ji. 2022. X-CLIP: End-to-End Multi-grained Contrastive Learning for Video-Text Retrieval. In MM '22: The 30th ACM International Conference on Multimedia. ACM, Lisboa, Portugal, 638--647."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_37_1","volume-title":"Expanding Language-Image Pretrained Models for General Video Recognition. In European Conference on Computer Vision.","author":"Ni Bolin","year":"2022","unstructured":"Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, and Haibin Ling. 2022. Expanding Language-Image Pretrained Models for General Video Recognition. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_38_1","volume-title":"Annual Meeting of the Association for Computational Linguistics(ACL). ACL","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In Annual Meeting of the Association for Computational Linguistics(ACL). ACL, Philadelphia, PA, USA, 311--318."},{"key":"e_1_3_2_1_39_1","volume-title":"The Tenth International Conference on Learning Representations, ICLR","author":"Qin Zhen","year":"2022","unstructured":"Zhen Qin, Weixuan Sun, Hui Deng, Dongxu Li, Yunshen Wei, Baohong Lv, Junjie Yan, Lingpeng Kong, and Yiran Zhong. 2022. cosFormer: Rethinking Softmax In Attention. In The Tenth International Conference on Learning Representations, ICLR 2022. IEEE, Virtual Event."},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning(ICML)","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning(ICML), Vol. 139. PMLR, Virtual Event, 8748--8763."},{"key":"e_1_3_2_1_41_1","volume-title":"Self-Critical Sequence Training for Image Captioning. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 1179--1195","author":"Rennie Steven J.","year":"2016","unstructured":"Steven J. Rennie, Etienne Marcheret, Youssef Mroueh, Jerret Ross, and Vaibhava Goel. 2016. Self-Critical Sequence Training for Image Captioning. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 1179--1195."},{"key":"e_1_3_2_1_42_1","volume-title":"End-to-end Generative Pretraining for Multimodal Video Captioning. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Seo Paul Hongsuck","year":"2022","unstructured":"Paul Hongsuck Seo, Arsha Nagrani, Anurag Arnab, and Cordelia Schmid. 2022. End-to-end Generative Pretraining for Multimodal Video Captioning. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022), 17938--17947."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479207"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3232634"},{"key":"e_1_3_2_1_45_1","volume-title":"Representation Learning with Contrastive Predictive Coding. arXiv preprint","author":"van den Oord A\u00e4ron","year":"2018","unstructured":"A\u00e4ron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. arXiv preprint, Vol. arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_46_1","volume-title":"Advances in Neural Information Processing Systems(NIPS)","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, L Ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems(NIPS), Vol. 30. Long Beach, CA, 5998--6008."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_48_1","volume-title":"GIT: A Generative Image-to-text Transformer for Vision and Language. Transactions on Machine Learning Research","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. GIT: A Generative Image-to-text Transformer for Vision and Language. Transactions on Machine Learning Research (2022)."},{"key":"e_1_3_2_1_49_1","volume-title":"International Conference on Machine Learning, ICML 2022 (Proceedings of Machine Learning Research","volume":"23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022. OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework. In International Conference on Machine Learning, ICML 2022 (Proceedings of Machine Learning Research, Vol. 162). PMLR, Baltimore, Maryland, USA, 23318--23340."},{"key":"e_1_3_2_1_50_1","volume-title":"Linformer: Self-Attention with Linear Complexity. arXiv preprint","author":"Wang Sinong","year":"2020","unstructured":"Sinong Wang, Belinda Z. Li, Madian Khabsa, Han Fang, and Hao Ma. 2020. Linformer: Self-Attention with Linear Complexity. arXiv preprint, Vol. arXiv:2006.04768 (2020)."},{"key":"e_1_3_2_1_51_1","volume-title":"High-Quality Multilingual Dataset for Video-and-Language Research. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE\/CVF, 4580--4590","author":"Wang Xin Eric","year":"2019","unstructured":"Xin Eric Wang, Jiawei Wu, Junkun Chen, Lei Li, Yuan-fang Wang, and William Yang Wang. 2019. VaTeX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE\/CVF, 4580--4590."},{"key":"e_1_3_2_1_52_1","volume-title":"Flowformer: Linearizing Transformers with Conservation Flows. In International Conference on Machine Learning, ICML 2022 (Proceedings of Machine Learning Research","volume":"24242","author":"Wu Haixu","year":"2022","unstructured":"Haixu Wu, Jialong Wu, Jiehui Xu, Jianmin Wang, and Mingsheng Long. 2022. Flowformer: Linearizing Transformers with Conservation Flows. In International Conference on Machine Learning, ICML 2022 (Proceedings of Machine Learning Research, Vol. 162). PMLR, Baltimore, Maryland, USA, 24226--24242."},{"key":"e_1_3_2_1_53_1","volume-title":"Image and Video. In International Conference on Machine Learning, ICML 2023 (Proceedings of Machine Learning Research","volume":"38748","author":"Xu Haiyang","year":"2023","unstructured":"Haiyang Xu, Qinghao Ye, Ming Yan, Yaya Shi, Jiabo Ye, Yuanhong Xu, Chenliang Li, Bin Bi, Qi Qian, Wei Wang, Guohai Xu, Ji Zhang, Songfang Huang, Fei Huang, and Jingren Zhou. 2023. mPLUG-2: A Modularized Multi-modal Foundation Model Across Text, Image and Video. In International Conference on Machine Learning, ICML 2023 (Proceedings of Machine Learning Research, Vol. 202). PMLR, Honolulu, USA, 38728--38748."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_55_1","volume-title":"The Eleventh International Conference on Learning Representations, ICLR","author":"Xue Hongwei","year":"2023","unstructured":"Hongwei Xue, Yuchong Sun, Bei Liu, Jianlong Fu, Ruihua Song, Houqiang Li, and Jiebo Luo. 2023. CLIP-ViP: Adapting Pre-trained Image-Text Model to Video-Language Alignment. In The Eleventh International Conference on Learning Representations, ICLR 2023. Kigali, Rwanda."},{"key":"e_1_3_2_1_56_1","volume-title":"Video-Text Modeling with Zero-Shot Transfer from Contrastive Captioners. arXiv preprint","author":"Yan Shen","year":"2022","unstructured":"Shen Yan, Tao Zhu, Zirui Wang, Yuan Cao, Mi Zhang, Soham Ghosh, Yonghui Wu, and Jiahui Yu. 2022. Video-Text Modeling with Zero-Shot Transfer from Contrastive Captioners. arXiv preprint, Vol. arXiv:2212.04979 (2022)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_2_1_58_1","volume-title":"PRCV 2022 (Lecture Notes in Computer Science","volume":"381","author":"Yang Bang","year":"2022","unstructured":"Bang Yang, Tong Zhang, and Yuexian Zou. 2022. CLIP Meets Video Captioning: Concept-Aware Representation Learning Does Matter. In Pattern Recognition and Computer Vision - 5th Chinese Conference, PRCV 2022 (Lecture Notes in Computer Science, Vol. 13534). Springer, Shenzhen, China, 368--381."},{"key":"e_1_3_2_1_59_1","volume-title":"AIM: Adapting Image Models for Efficient Video Action Recognition. In The Eleventh International Conference on Learning Representations, ICLR 2023","author":"Yang Taojiannan","year":"2023","unstructured":"Taojiannan Yang, Yi Zhu, Yusheng Xie, Aston Zhang, Chen Chen, and Mu Li. 2023. AIM: Adapting Image Models for Efficient Video Action Recognition. In The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1--5, 2023."},{"key":"e_1_3_2_1_60_1","volume-title":"CoCa: Contrastive Captioners are Image-Text Foundation Models. Transactions on Machine Learning Research","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. CoCa: Contrastive Captioners are Image-Text Foundation Models. Transactions on Machine Learning Research (2022)."},{"key":"e_1_3_2_1_61_1","volume-title":"Open-book Video Captioning with Retrieve-Copy-Generate Network. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 9832--9841","author":"Zhang Ziqi","year":"2021","unstructured":"Ziqi Zhang, Zhongang Qi, C. Yuan, Ying Shan, Bing Li, Ying Deng, and Weiming Hu. 2021. Open-book Video Captioning with Retrieve-Copy-Generate Network. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 9832--9841."},{"key":"e_1_3_2_1_62_1","volume-title":"Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting. In Thirty-Fifth AAAI Conference on Artificial Intelligence, AAAI","author":"Zhou Haoyi","year":"2021","unstructured":"Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. 2021. Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting. In Thirty-Fifth AAAI Conference on Artificial Intelligence, AAAI 2021. AAAI Press, Virtual Event, 11106--11115."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680778","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680778","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:42Z","timestamp":1750294662000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680778"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":62,"alternative-id":["10.1145\/3664647.3680778","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680778","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}