{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:10:06Z","timestamp":1755821406058,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612537","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"4460-4470","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Learning Semantics-Grounded Vocabulary Representation for Video-Text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0465-6712","authenticated-orcid":false,"given":"Yaya","family":"Shi","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0439-2692","authenticated-orcid":false,"given":"Haowei","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS &amp; University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9442-5912","authenticated-orcid":false,"given":"Haiyang","family":"Xu","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1980-9379","authenticated-orcid":false,"given":"Zongyang","family":"Ma","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS &amp; University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7977-5540","authenticated-orcid":false,"given":"Qinghao","family":"Ye","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8839-4996","authenticated-orcid":false,"given":"Anwen","family":"Hu","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4959-8878","authenticated-orcid":false,"given":"Ming","family":"Yan","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3835-7975","authenticated-orcid":false,"given":"Ji","family":"Zhang","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3709-5053","authenticated-orcid":false,"given":"Fei","family":"Huang","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2219-4961","authenticated-orcid":false,"given":"Chunfeng","family":"Yuan","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS &amp; University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6114-1411","authenticated-orcid":false,"given":"Bing","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS &amp; University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9237-8825","authenticated-orcid":false,"given":"Weiming","family":"Hu","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS &amp; University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2510-8993","authenticated-orcid":false,"given":"Zheng-Jun","family":"Zha","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text. arXiv preprint arXiv:2104.11178","author":"Akbari Hassan","year":"2021","unstructured":"Hassan Akbari, Linagzhe Yuan, Rui Qian, Wei-Hong Chuang, Shih-Fu Chang, Yin Cui, and Boqing Gong. 2021. Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text. arXiv preprint arXiv:2104.11178 (2021)."},{"key":"e_1_3_2_1_2_1","unstructured":"Humam Alwassel Dhruv Mahajan Bruno Korbar Lorenzo Torresani Bernard Ghanem and Du Tran. 2020. Self-Supervised Learning by Cross-Modal Audio-Video Clustering. In NeurIPS."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Lisa Anne Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing moments in video with natural language. In ICCV. 5803--5812.","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_4_1","unstructured":"Jinbin Bai Chunhui Liu Feiyue Ni Haofan Wang Mengying Hu Xiaofeng Guo and Lele Cheng. 2022. LaT: Latent Translation with Cycle-Consistency for Video-Text Retrieval. (2022). arxiv: 2207.04858 [cs.CV]"},{"key":"e_1_3_2_1_5_1","volume-title":"SparTerm: Learning Term-based Sparse Representation for Fast Text Retrieval. ArXiv","author":"Bai Yang","year":"2020","unstructured":"Yang Bai, Xiaoguang Li, Gang Wang, Chaoliang Zhang, Lifeng Shang, Jun Xu, Zhaowei Wang, Fangshan Wang, and Qun Liu. 2020. SparTerm: Learning Term-based Sparse Representation for Fast Text Retrieval. ArXiv, Vol. abs\/2010.00768 (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Max Bain Arsha Nagrani G\u00fcl Varol and Andrew Zisserman. 2021. Frozen in time: A joint video and image encoder for end-to-end retrieval. In ICCV. 1728--1738.","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_7_1","first-page":"4","article-title":"Is space-time attention all you need for video understanding?","volume":"2","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In ICML, Vol. 2. 4.","journal-title":"ICML"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2014.2372616"},{"volume-title":"End-to-end object detection with transformers","author":"Carion Nicolas","key":"e_1_3_2_1_9_1","unstructured":"Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. 2020. End-to-end object detection with transformers. In ECCV. Springer, 213--229."},{"key":"e_1_3_2_1_10_1","unstructured":"David Chen and William B Dolan. 2011. Collecting highly parallel data for paraphrase evaluation. In ACL. 190--200."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"e_1_3_2_1_12_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_13_1","volume-title":"SPLADE v2: Sparse Lexical and Expansion Model for Information Retrieval. ArXiv","author":"Formal Thibault","year":"2021","unstructured":"Thibault Formal, C. Lassance, Benjamin Piwowarski, and St\u00e9phane Clinchant. 2021a. SPLADE v2: Sparse Lexical and Expansion Model for Information Retrieval. ArXiv, Vol. abs\/2109.10086 (2021)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463098"},{"key":"e_1_3_2_1_15_1","unstructured":"Yuying Ge Yixiao Ge Xihui Liu Dian Li Ying Shan Xiaohu Qie and Ping Luo. 2022a. Bridging Video-Text Retrieval With Multiple Choice Questions. In CVPR. 16167--16176."},{"key":"e_1_3_2_1_16_1","volume-title":"MILES: Visual BERT Pre-training with Injected Language Semantics for Video-text Retrieval. In ECCV.","author":"Ge Yuying","year":"2022","unstructured":"Yuying Ge, Yixiao Ge, Xihui Liu, Alex Wang, Jianping Wu, Ying Shan, Xiaohu Qie, and Ping Luo. 2022b. MILES: Visual BERT Pre-training with Injected Language Semantics for Video-text Retrieval. In ECCV."},{"key":"e_1_3_2_1_17_1","volume-title":"Self-supervised Co-Training for Video Representation Learning. NeurIPS","author":"Han Tengda","year":"2020","unstructured":"Tengda Han, Weidi Xie, and Andrew Zisserman. 2020. Self-supervised Co-Training for Video Representation Learning. NeurIPS (2020), 5679--5690."},{"key":"e_1_3_2_1_18_1","volume-title":"NeurIPS","volume":"34","author":"Huo Yuqi","year":"2021","unstructured":"Yuqi Huo, Mingyu Ding, Haoyu Lu, Nanyi Fei, Zhiwu Lu, Ji-Rong Wen, and Ping Luo. 2021. Compressed Video Contrastive Learning. NeurIPS, Vol. 34 (2021)."},{"key":"e_1_3_2_1_19_1","first-page":"30291","article-title":"Expectation-Maximization Contrastive Learning for Compact Video-and-Language Representations","volume":"35","author":"Jin Peng","year":"2022","unstructured":"Peng Jin, JinFa Huang, Fenglin Liu, Xian Wu, Shen Ge, Guoli Song, David A. Clifton, and Jie Chen. 2022. Expectation-Maximization Contrastive Learning for Compact Video-and-Language Representations. In NeurIPS, Vol. 35. 30291--30306.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Hildegard Kuehne Hueihan Jhuang Est'ibaliz Garrote Tomaso Poggio and Thomas Serre. 2011. HMDB: a large video database for human motion recognition. In ICCV. 2556--2563.","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_1_21_1","volume-title":"The Hungarian method for the assignment problem. Naval research logistics quarterly","author":"Kuhn Harold W","year":"1955","unstructured":"Harold W Kuhn. 1955. The Hungarian method for the assignment problem. Naval research logistics quarterly, Vol. 2, 1--2 (1955), 83--97."},{"key":"e_1_3_2_1_22_1","volume-title":"An Efficiency Study for SPLADE Models. In SIGIR '22: The 45th International ACM SIGIR Conference on Research and Development in Information Retrieval","author":"Lassance Carlos","year":"2022","unstructured":"Carlos Lassance and St\u00e9phane Clinchant. 2022. An Efficiency Study for SPLADE Models. In SIGIR '22: The 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, Madrid, Spain, July 11 - 15, 2022, Enrique Amig\u00f3, Pablo Castells, Julio Gonzalo, Ben Carterette, J. Shane Culpepper, and Gabriella Kazai (Eds.). ACM, 2220--2226."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Jie Lei Linjie Li Luowei Zhou Zhe Gan Tamara L Berg Mohit Bansal and Jingjing Liu. 2021. Less is more: Clipbert for video-and-language learning via sparse sampling. In CVPR. 7331--7341.","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Chenliang Li Haiyang Xu Junfeng Tian Wei Wang Ming Yan Bin Bi Jiabo Ye Hehong Chen Guohai Xu Zheng Cao et al. 2022b. mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections. arXiv preprint arXiv:2205.12005 (2022).","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"e_1_3_2_1_25_1","volume-title":"Juan Carlos Niebles, and Steven CH Hoi","author":"Li Dongxu","year":"2022","unstructured":"Dongxu Li, Junnan Li, Hongdong Li, Juan Carlos Niebles, and Steven CH Hoi. 2022a. Align and Prompt: Video-and-Language Pre-training with Entity Prompts. In CVPR. 4953--4963."},{"key":"e_1_3_2_1_26_1","first-page":"17612","article-title":"Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning","volume":"35","author":"Liang Victor Weixin","year":"2022","unstructured":"Victor Weixin Liang, Yuhui Zhang, Yongchan Kwon, Serena Yeung, and James Y Zou. 2022. Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning. NeurIPS, Vol. 35 (2022), 17612--17625.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Kevin Lin Linjie Li Chung-Ching Lin Faisal Ahmed Zhe Gan Zicheng Liu Yumao Lu and Lijuan Wang. 2022. SwinBERT: End-to-end transformers with sparse attention for video captioning. In CVPR. 17949--17958.","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Xuejing Liu Liang Li Shuhui Wang Zheng-Jun Zha Dechao Meng and Qingming Huang. 2019. Adaptive Reconstruction Network for Weakly Supervised Referring Expression Grounding. In ICCV. 2611--2620.","DOI":"10.1109\/ICCV.2019.00270"},{"key":"e_1_3_2_1_29_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In ICLR."},{"key":"e_1_3_2_1_30_1","volume-title":"X-CLIP: End-to-End Multi-grained Contrastive Learning for Video-Text Retrieval. arXiv preprint arXiv:2207.07285","author":"Ma Yiwei","year":"2022","unstructured":"Yiwei Ma, Guohai Xu, Xiaoshuai Sun, Ming Yan, Ji Zhang, and Rongrong Ji. 2022. X-CLIP: End-to-End Multi-grained Contrastive Learning for Video-Text Retrieval. arXiv preprint arXiv:2207.07285 (2022)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Antoine Miech Jean-Baptiste Alayrac Lucas Smaira Ivan Laptev Josef Sivic and Andrew Zisserman. 2020. End-to-end learning of visual representations from uncurated instructional videos. In CVPR. 9879--9889.","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Antoine Miech Dimitri Zhukov Jean-Baptiste Alayrac Makarand Tapaswi Ivan Laptev and Josef Sivic. 2019. Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In ICCV. 2630--2640.","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_33_1","volume-title":"Ning Xu, Pradeep Ravikumar, and Barnab\u00e1s P\u00f3czos.","author":"Paria Biswajit","year":"2020","unstructured":"Biswajit Paria, Chih-Kuan Yeh, Ian En-Hsu Yen, Ning Xu, Pradeep Ravikumar, and Barnab\u00e1s P\u00f3czos. 2020. Minimizing FLOPs to Learn Efficient Sparse Representations. In ICLR."},{"key":"e_1_3_2_1_34_1","unstructured":"Mandela Patrick Po-Yao Huang Yuki Asano Florian Metze Alexander G Hauptmann Joao F Henriques and Andrea Vedaldi. 2020. Support-set bottlenecks for video-text representation learning. In ICLR."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"AJ Piergiovanni Anelia Angelova and Michael S Ryoo. 2020. Evolving losses for unsupervised video representation learning. In CVPR. 133--142.","DOI":"10.1109\/CVPR42600.2020.00021"},{"key":"e_1_3_2_1_36_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_37_1","volume-title":"Imagenet-21k pretraining for the masses. arXiv preprint arXiv:2104.10972","author":"Ridnik Tal","year":"2021","unstructured":"Tal Ridnik, Emanuel Ben-Baruch, Asaf Noy, and Lihi Zelnik-Manor. 2021. Imagenet-21k pretraining for the masses. arXiv preprint arXiv:2104.10972 (2021)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Anna Rohrbach Marcus Rohrbach Niket Tandon and Bernt Schiele. 2015. A dataset for movie description. In CVPR. 3202--3212.","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"e_1_3_2_1_39_1","volume-title":"Avlnet: Learning audio-visual language representations from instructional videos. arXiv preprint arXiv:2006.09199","author":"Rouditchenko Andrew","year":"2020","unstructured":"Andrew Rouditchenko, Angie Boggust, David Harwath, Brian Chen, Dhiraj Joshi, Samuel Thomas, Kartik Audhkhasi, Hilde Kuehne, Rameswar Panda, Rogerio Feris, et al. 2020. Avlnet: Learning audio-visual language representations from instructional videos. arXiv preprint arXiv:2006.09199 (2020)."},{"key":"e_1_3_2_1_40_1","volume-title":"Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL.","author":"Sharma Piyush","year":"2018","unstructured":"Piyush Sharma, Nan Ding, Sebastian Goodman, and Radu Soricut. 2018. Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Yaya Shi Xu Yang Haiyang Xu Chunfeng Yuan Bing Li Weiming Hu and Zheng-Jun Zha. 2022. EMScore: Evaluating Video Captioning via Coarse-Grained and Fine-Grained Embedding Matching. In CVPR. 17908--17917.","DOI":"10.1109\/CVPR52688.2022.01740"},{"key":"e_1_3_2_1_42_1","unstructured":"Junhyuk So Changdae Oh Yongtaek Lim Hoyoon Byun Minchul Shin and Kyungwoo Song. 2022. Geodesic Multi-Modal Mixup for Robust Fine-Tuning. (2022). arxiv: 2203.03897 [cs.CV]"},{"key":"e_1_3_2_1_43_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Ganchao Tan Daqing Liu Meng Wang and Zheng-Jun Zha. 2020. Learning to Discretely Compose Reasoning Module Networks for Video Captioning. In IJCAI Christian Bessiere (Ed.). 745--752.","DOI":"10.24963\/ijcai.2020\/104"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Alex Jinpeng Wang Yixiao Ge Guanyu Cai Rui Yan Xudong Lin Ying Shan Xiaohu Qie and Mike Zheng Shou. 2022. Object-aware Video-language Pre-training for Retrieval. In CVPR. 3303--3312.","DOI":"10.1109\/CVPR52688.2022.00331"},{"key":"e_1_3_2_1_46_1","volume-title":"Vatex: A large-scale, high-quality multilingual dataset for video-and-language research. In ICCV. 4581--4591.","author":"Wang Xin","year":"2019","unstructured":"Xin Wang, Jiawei Wu, Junkun Chen, Lei Li, Yuan-Fang Wang, and William Yang Wang. 2019. Vatex: A large-scale, high-quality multilingual dataset for video-and-language research. In ICCV. 4581--4591."},{"key":"e_1_3_2_1_47_1","volume-title":"Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084","author":"Xu Hu","year":"2021","unstructured":"Hu Xu, Gargi Ghosh, Po-Yao Huang, Dmytro Okhonko, Armen Aghajanyan, Florian Metze, Luke Zettlemoyer, and Christoph Feichtenhofer. 2021a. Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)."},{"key":"e_1_3_2_1_48_1","unstructured":"Haiyang Xu Ming Yan Chenliang Li Bin Bi Songfang Huang Wenming Xiao and Fei Huang. 2021b. E2E-VLP: End-to-End Vision-Language Pre-training Enhanced by Visual Learning. In ACL. 503--513."},{"key":"e_1_3_2_1_49_1","volume-title":"mPLUG-2: A Modularized Multi-modal Foundation Model Across Text, Image and Video. ArXiv","author":"Xu Haiyang","year":"2023","unstructured":"Haiyang Xu, Qinghao Ye, Mingshi Yan, Yaya Shi, Jiabo Ye, Yuanhong Xu, Chenliang Li, Bin Bi, Qiuchen Qian, Wei Wang, Guohai Xu, Ji Zhang, Songfang Huang, Feiran Huang, and Jingren Zhou. 2023. mPLUG-2: A Modularized Multi-modal Foundation Model Across Text, Image and Video. ArXiv, Vol. abs\/2302.00402 (2023)."},{"key":"e_1_3_2_1_50_1","unstructured":"Jun Xu Tao Mei Ting Yao and Yong Rui. [n.d.]. Msr-vtt: A large video description dataset for bridging video and language. In CVPR."},{"key":"e_1_3_2_1_51_1","volume-title":"Making History Matter: History-Advantage Sequence Training for Visual Dialog. ICCV","author":"Yang Tianhao","year":"2019","unstructured":"Tianhao Yang, Zhengjun Zha, and Hanwang Zhang. 2019. Making History Matter: History-Advantage Sequence Training for Visual Dialog. ICCV (2019), 2561--2569."},{"key":"e_1_3_2_1_52_1","volume-title":"mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. ArXiv","author":"Ye Qinghao","year":"2023","unstructured":"Qinghao Ye, Haiyang Xu, Guohai Xu, Jiabo Ye, Ming Yan, Yi Zhou, Junyan Wang, Anwen Hu, Pengcheng Shi, Yaya Shi, Chenliang Li, Yuanhong Xu, Hehong Chen, Junfeng Tian, Qiang Qi, Ji Zhang, and Feiyan Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. ArXiv, Vol. abs\/2304.14178 (2023)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612537","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612537","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:57:08Z","timestamp":1755820628000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612537"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":52,"alternative-id":["10.1145\/3581783.3612537","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612537","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}