{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T01:18:10Z","timestamp":1769044690947,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612132","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"3975-3984","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Graph-Based Video-Language Learning with Multi-Grained Audio-Visual Alignment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6733-5879","authenticated-orcid":false,"given":"Chenyang","family":"Lyu","sequence":"first","affiliation":[{"name":"Dublin City University, Dublin, Ireland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8449-0489","authenticated-orcid":false,"given":"Wenxi","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0143-6220","authenticated-orcid":false,"given":"Tianbo","family":"Ji","sequence":"additional","affiliation":[{"name":"Nantong University, Nantong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9062-6183","authenticated-orcid":false,"given":"Longyue","family":"Wang","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7778-8743","authenticated-orcid":false,"given":"Liting","family":"Zhou","sequence":"additional","affiliation":[{"name":"Dublin City University, Dublin, Ireland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2903-3968","authenticated-orcid":false,"given":"Cathal","family":"Gurrin","sequence":"additional","affiliation":[{"name":"Dublin City University, Dublin, Ireland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0667-7349","authenticated-orcid":false,"given":"Linyi","family":"Yang","sequence":"additional","affiliation":[{"name":"Westlake University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0294-6620","authenticated-orcid":false,"given":"Yi","family":"Yu","sequence":"additional","affiliation":[{"name":"National Institute of Informatics, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6741-4855","authenticated-orcid":false,"given":"Yvette","family":"Graham","sequence":"additional","affiliation":[{"name":"Trinity College Dublin, Dublin, Ireland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7789-4853","authenticated-orcid":false,"given":"Jennifer","family":"Foster","sequence":"additional","affiliation":[{"name":"Dublin City University, Dublin, Ireland"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Huda Alamri Vincent Cartillier Abhishek Das Jue Wang Anoop Cherian Irfan Essa Dhruv Batra Tim K Marks Chiori Hori Peter Anderson et al. 2019a. Audio visual scene-aware dialog. In CVPR.","DOI":"10.1109\/CVPR.2019.00774"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Huda Alamri Vincent Cartillier Abhishek Das Jue Wang Anoop Cherian Irfan Essa Dhruv Batra Tim K. Marks Chiori Hori Peter Anderson Stefan Lee and Devi Parikh. 2019b. Audio-Visual Scene-Aware Dialog. In CVPR.","DOI":"10.1109\/CVPR.2019.00774"},{"key":"e_1_3_2_1_3_1","volume-title":"VQA: Visual Question Answering. In ICCV.","author":"Antol Stanislaw","year":"2015","unstructured":"Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C Lawrence Zitnick, and Devi Parikh. 2015. VQA: Visual Question Answering. In ICCV."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Max Bain Arsha Nagrani G\u00fcl Varol and Andrew Zisserman. 2021. Frozen in time: A joint video and image encoder for end-to-end retrieval. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_5_1","volume-title":"A CLIP-Hitchhiker's Guide to Long Video Retrieval. arXiv preprint arXiv:2205.08508","author":"Bain Max","year":"2022","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2022. A CLIP-Hitchhiker's Guide to Long Video Retrieval. arXiv preprint arXiv:2205.08508 (2022)."},{"key":"e_1_3_2_1_6_1","first-page":"16975","article-title":"Learning audio-visual dynamics using scene graphs for audio source separation","volume":"35","author":"Chatterjee Moitreya","year":"2022","unstructured":"Moitreya Chatterjee, Narendra Ahuja, and Anoop Cherian. 2022. Learning audio-visual dynamics using scene graphs for audio source separation. NeurIPS, Vol. 35 (2022), 16975--16988.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_7_1","volume-title":"Narendra Ahuja, and Anoop Cherian.","author":"Chatterjee Moitreya","year":"2021","unstructured":"Moitreya Chatterjee, Jonathan Le Roux, Narendra Ahuja, and Anoop Cherian. 2021. Visual scene graphs for audio source separation. In ICCV. 1204--1213."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Shizhe Chen Yida Zhao Qin Jin and Qi Wu. 2020. Fine-grained video-text retrieval with hierarchical graph reasoning. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_1_9_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NNACL.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NNACL."},{"key":"e_1_3_2_1_10_1","first-page":"3377","article-title":"Predicting visual features from text for image and video caption retrieval","volume":"20","author":"Dong Jianfeng","year":"2018","unstructured":"Jianfeng Dong, Xirong Li, and Cees GM Snoek. 2018. Predicting visual features from text for image and video caption retrieval. IEEE TMM, Vol. 20, 12 (2018), 3377--3388.","journal-title":"IEEE TMM"},{"key":"e_1_3_2_1_11_1","first-page":"4065","article-title":"Dual encoding for video retrieval by text","volume":"44","author":"Dong Jianfeng","year":"2021","unstructured":"Jianfeng Dong, Xirong Li, Chaoxi Xu, Xun Yang, Gang Yang, Xun Wang, and Meng Wang. 2021. Dual encoding for video retrieval by text. IEEE TPAMI, Vol. 44, 8 (2021), 4065--4080.","journal-title":"IEEE TPAMI"},{"key":"e_1_3_2_1_12_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_13_1","unstructured":"Chenyou Fan Xiaofan Zhang Shu Zhang Wensheng Wang Chi Zhang and Heng Huang. 2019. Heterogeneous memory enhanced multimodal attention model for video question answering. In CVPR."},{"key":"e_1_3_2_1_14_1","first-page":"2283","article-title":"Temporal Reasoning via Audio Question Answering","volume":"28","author":"Fayek Haytham M","year":"2020","unstructured":"Haytham M Fayek and Justin Johnson. 2020. Temporal Reasoning via Audio Question Answering. IEEE TASLP, Vol. 28 (2020), 2283--2294.","journal-title":"IEEE TASLP"},{"key":"e_1_3_2_1_15_1","volume-title":"Benoit Huet, and Chong-Wah Ngo.","author":"Francis Danny","year":"2019","unstructured":"Danny Francis, Phuong Anh Nguyen, Benoit Huet, and Chong-Wah Ngo. 2019. Fusion of multimodal embeddings for ad-hoc video search. In ICCVW."},{"key":"e_1_3_2_1_16_1","volume-title":"Lijuan Wang, and Zicheng Liu.","author":"Fu Tsu-Jui","year":"2021","unstructured":"Tsu-Jui Fu, Linjie Li, Zhe Gan, Kevin Lin, William Yang Wang, Lijuan Wang, and Zicheng Liu. 2021. VIOLET: End-to-end video-language transformers with masked visual-token modeling. arXiv preprint arXiv:2111.12681 (2021)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Valentin Gabeur Chen Sun Karteek Alahari and Cordelia Schmid. 2020. Multi-modal transformer for video retrieval. In ECCV.","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_18_1","first-page":"3","article-title":"Vision-language pre-training: Basics, recent advances, and future trends","volume":"14","author":"Gan Zhe","year":"2022","unstructured":"Zhe Gan, Linjie Li, Chunyuan Li, Lijuan Wang, Zicheng Liu, Jianfeng Gao, et al. 2022. Vision-language pre-training: Basics, recent advances, and future trends. FTCGV, Vol. 14, 3--4 (2022), 163--352.","journal-title":"FTCGV"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Matt Gardner Joel Grus Mark Neumann Oyvind Tafjord Pradeep Dasigi Nelson F. Liu Matthew Peters Michael Schmitz and Luke Zettlemoyer. 2018. AllenNLP: A Deep Semantic Natural Language Processing Platform. In NLP-OSS.","DOI":"10.18653\/v1\/W18-2501"},{"key":"e_1_3_2_1_20_1","volume-title":"Clover: Towards A Unified Video-Language Alignment and Fusion Model. arXiv preprint arXiv:2207.07885","author":"Huang Jingjia","year":"2022","unstructured":"Jingjia Huang, Yinan Li, Jiashi Feng, Xiaoshuai Sun, and Rongrong Ji. 2022. Clover: Towards A Unified Video-Language Alignment and Fusion Model. arXiv preprint arXiv:2207.07885 (2022)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Pin Jiang and Yahong Han. 2020. Reasoning with heterogeneous graph alignment for video question answering. In AAAI.","DOI":"10.1609\/aaai.v34i07.6767"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Vladimir Karpukhin Barlas O\u011fuz Sewon Min Patrick Lewis Ledell Wu Sergey Edunov Danqi Chen and Wen-tau Yih. 2020. Dense Passage Retrieval for Open-Domain Question Answering. In EMNLP.","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"e_1_3_2_1_23_1","volume-title":"Seong Jong Ha, and Je-Won Kang","author":"Kim Nayoung","year":"2021","unstructured":"Nayoung Kim, Seong Jong Ha, and Je-Won Kang. 2021. Video Question Answering Using Language-Guided Deep Compressed-Domain Video Feature. In ICCV."},{"key":"e_1_3_2_1_24_1","unstructured":"Thao Minh Le Vuong Le Svetha Venkatesh and Truyen Tran. 2020. Hierarchical conditional relation networks for video question answering. In CVPR."},{"key":"e_1_3_2_1_25_1","unstructured":"Sangmin Lee Sungjune Park and Yong Man Ro. 2022a. Audio-Visual Mismatch-Aware Video Retrieval via Association and Adjustment. In ECCV."},{"key":"e_1_3_2_1_26_1","unstructured":"Sangmin Lee Sungjune Park and Yong Man Ro. 2022b. Audio-Visual Mismatch-Aware Video Retrieval via Association and Adjustment. In ECCV."},{"key":"e_1_3_2_1_27_1","volume-title":"Revealing Single Frame Bias for Video-and-Language Learning. arXiv preprint arXiv:2206.03428","author":"Lei Jie","year":"2022","unstructured":"Jie Lei, Tamara L Berg, and Mohit Bansal. 2022. Revealing Single Frame Bias for Video-and-Language Learning. arXiv preprint arXiv:2206.03428 (2022)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Jie Lei Linjie Li Luowei Zhou Zhe Gan Tamara L Berg Mohit Bansal and Jingjing Liu. 2021. Less is more: Clipbert for video-and-language learning via sparse sampling. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_29_1","unstructured":"Guangyao Li Yake Wei Yapeng Tian Chenliang Xu Ji-Rong Wen and Di Hu. 2022b. Learning to answer questions in dynamic audio-visual scenarios. In CVPR."},{"key":"e_1_3_2_1_30_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML."},{"key":"e_1_3_2_1_31_1","volume-title":"Heng Tao Shen, and Jingkuan Song","author":"Li Xiangpeng","year":"2019","unstructured":"Xiangpeng Li, Lianli Gao, Xuanhan Wang, Wu Liu, Xing Xu, Heng Tao Shen, and Jingkuan Song. 2019a. Learnable aggregating net with diversity learning for video question answering. In ACM MM."},{"key":"e_1_3_2_1_32_1","unstructured":"Xiangpeng Li Jingkuan Song Lianli Gao Xianglong Liu Wenbing Huang Xiangnan He and Chuang Gan. 2019b. Beyond rnns: Positional self-attention with co-attention for video question answering. In AAAI."},{"key":"e_1_3_2_1_33_1","unstructured":"Xirong Li Chaoxi Xu Gang Yang Zhineng Chen and Jianfeng Dong. 2019c. W2vv fully deep learning for ad-hoc video search. In ACM MM."},{"key":"e_1_3_2_1_34_1","unstructured":"Hongying Liu Ruyi Luo Fanhua Shang Mantang Niu and Yuanyuan Liu. 2021. Progressive semantic matching for video-text retrieval. In ACM MM."},{"key":"e_1_3_2_1_35_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In ICLR."},{"key":"e_1_3_2_1_36_1","volume-title":"Hierarchical question-image co-attention for visual question answering. arXiv preprint arXiv:1606.00061","author":"Lu Jiasen","year":"2016","unstructured":"Jiasen Lu, Jianwei Yang, Dhruv Batra, and Devi Parikh. 2016. Hierarchical question-image co-attention for visual question answering. arXiv preprint arXiv:1606.00061 (2016)."},{"key":"e_1_3_2_1_37_1","unstructured":"Chenyang Lyu Tianbo Ji Yvette Graham and Jennifer Foster. 2023 a. Is a Video worth n n Images? A Highly Efficient Approach to Transformer-based Video Question Answering. In ACLW."},{"key":"e_1_3_2_1_38_1","unstructured":"Chenyang Lyu Tianbo Ji Yvette Graham and Jennifer Foster. 2023 b. Semantic-Aware Dynamic Retrospective-Prospective Reasoning for Event-Level Video Question Answering. In ACL."},{"key":"e_1_3_2_1_39_1","unstructured":"Chenyang Lyu Manh-Duy Nguyen Van-Tu Ninh Liting Zhou Cathal Gurrin and Jennifer Foster. 2023 c. Dialogue-to-Video Retrieval. In ECIR."},{"key":"e_1_3_2_1_40_1","volume-title":"Audio, Video, and Text Integration. arXiv preprint arXiv:2306.09093","author":"Lyu Chenyang","year":"2023","unstructured":"Chenyang Lyu, Minghao Wu, Longyue Wang, Xinting Huang, Bingshuai Liu, Zefeng Du, Shuming Shi, and Zhaopeng Tu. 2023 d. Macaw-LLM: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration. arXiv preprint arXiv:2306.09093 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Avinash Madasu Junier Oliva and Gedas Bertasius. 2022. Learning to Retrieve Videos by Asking Questions. In ACM MM.","DOI":"10.1145\/3503161.3548361"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Sho Maeoki Kohei Uehara and Tatsuya Harada. 2020. Interactive video retrieval with dialog. In CVPRW.","DOI":"10.1109\/CVPRW50498.2020.00484"},{"key":"e_1_3_2_1_43_1","unstructured":"Jianguo Mao Wenbin Jiang Xiangdong Wang Zhifan Feng Yajuan Lyu Hong Liu and Yong Zhu. 2022. Dynamic Multistep Reasoning based on Video Scene Graph for Video Question Answering. In NNACL."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1162\/coli.2008.34.2.145"},{"key":"e_1_3_2_1_45_1","volume-title":"Learning a text-video embedding from incomplete and heterogeneous data. arXiv:1804.02516","author":"Miech Antoine","year":"2018","unstructured":"Antoine Miech, Ivan Laptev, and Josef Sivic. 2018. Learning a text-video embedding from incomplete and heterogeneous data. arXiv:1804.02516 (2018)."},{"key":"e_1_3_2_1_46_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2022.01.001"},{"key":"e_1_3_2_1_48_1","volume-title":"Manning","author":"Schuster Sebastian","year":"2015","unstructured":"Sebastian Schuster, Ranjay Krishna, Angel Chang, Li Fei-Fei, and Christopher D. Manning. 2015. Generating Semantically Precise Scene Graphs from Textual Descriptions for Improved Image Retrieval. In EMNLPW."},{"key":"e_1_3_2_1_49_1","volume-title":"Vlg-net: Video-language graph matching network for video grounding. In ICCV. 3224--3234.","author":"Soldan Mattia","year":"2021","unstructured":"Mattia Soldan, Mengmeng Xu, Sisi Qu, Jesper Tegner, and Bernard Ghanem. 2021. Vlg-net: Video-language graph matching network for video grounding. In ICCV. 3224--3234."},{"key":"e_1_3_2_1_50_1","volume-title":"Carl Vondrick, Kevin Murphy, and Cordelia Schmid.","author":"Sun Chen","year":"2019","unstructured":"Chen Sun, Austin Myers, Carl Vondrick, Kevin Murphy, and Cordelia Schmid. 2019. VideoBERT: A joint model for video and language representation learning. In ICCV."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Yapeng Tian Jing Shi Bochen Li Zhiyao Duan and Chenliang Xu. 2018. Audio-visual event localization in unconstrained videos. In ECCV.","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_2_1_52_1","unstructured":"Alex Jinpeng Wang Yixiao Ge Rui Yan Ge Yuying Xudong Lin Guanyu Cai Jianping Wu Ying Shan Xiaohu Qie and Mike Zheng Shou. 2023. All in One: Exploring Unified Video-Language Pre-training. In CVPR."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Xiaohan Wang Linchao Zhu and Yi Yang. 2021. T2VLAD: global-local sequence alignment for text-video retrieval. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3088863"},{"key":"e_1_3_2_1_55_1","volume-title":"Transformers: State-of-the-art natural language processing. In EMNLP.","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, et al. 2020. Transformers: State-of-the-art natural language processing. In EMNLP."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Hao Wu Jiayuan Mao Yufeng Zhang Yuning Jiang Lei Li Weiwei Sun and Wei-Ying Ma. 2019. Unified visual-semantic embeddings: Bridging vision and language with structured meaning representations. In CVPR.","DOI":"10.1109\/CVPR.2019.00677"},{"key":"e_1_3_2_1_57_1","first-page":"119","article-title":"Graph neural networks for natural language processing: A survey","volume":"16","author":"Wu Lingfei","year":"2023","unstructured":"Lingfei Wu, Yu Chen, Kai Shen, Xiaojie Guo, Hanning Gao, Shucheng Li, Jian Pei, Bo Long, et al. 2023. Graph neural networks for natural language processing: A survey. FTML, Vol. 16, 2 (2023), 119--328.","journal-title":"FTML"},{"key":"e_1_3_2_1_58_1","first-page":"4","article-title":"A comprehensive survey on graph neural networks","volume":"32","author":"Wu Zonghan","year":"2020","unstructured":"Zonghan Wu, Shirui Pan, Fengwen Chen, Guodong Long, Chengqi Zhang, and S Yu Philip. 2020. A comprehensive survey on graph neural networks. IEEE TNNLS, Vol. 32, 1 (2020), 4--24.","journal-title":"IEEE TNNLS"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20184"},{"key":"e_1_3_2_1_60_1","volume-title":"Videoclip: Contrastive pre-training for zero-shot video-text understanding. In EMNLP.","author":"Xu Hu","year":"2021","unstructured":"Hu Xu, Gargi Ghosh, Po-Yao Huang, Dmytro Okhonko, Armen Aghajanyan, Florian Metze, Luke Zettlemoyer, and Christoph Feichtenhofer. 2021. Videoclip: Contrastive pre-training for zero-shot video-text understanding. In EMNLP."},{"key":"e_1_3_2_1_61_1","volume-title":"Msr-vtt: A large video description dataset for bridging video and language. In CVPR.","author":"Xu Jun","year":"2016","unstructured":"Jun Xu, Tao Mei, Ting Yao, and Yong Rui. 2016. Msr-vtt: A large video description dataset for bridging video and language. In CVPR."},{"key":"e_1_3_2_1_62_1","volume-title":"AVQA: A Dataset for Audio-Visual Question Answering on Videos. In ACM MM.","author":"Yang Pinci","year":"2022","unstructured":"Pinci Yang, Xin Wang, Xuguang Duan, Hong Chen, Runze Hou, Cong Jin, and Wenwu Zhu. 2022. AVQA: A Dataset for Audio-Visual Question Answering on Videos. In ACM MM."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Xun Yang Jianfeng Dong Yixin Cao Xun Wang Meng Wang and Tat-Seng Chua. 2020. Tree-augmented cross-modal encoding for complex-query video retrieval. In SIGIR.","DOI":"10.1145\/3397271.3401151"},{"key":"e_1_3_2_1_64_1","volume-title":"HiTeA: Hierarchical Temporal-Aware Video-Language Pre-training. arXiv preprint arXiv:2212.14546","author":"Ye Qinghao","year":"2022","unstructured":"Qinghao Ye, Guohai Xu, Ming Yan, Haiyang Xu, Qi Qian, Ji Zhang, and Fei Huang. 2022. HiTeA: Hierarchical Temporal-Aware Video-Language Pre-training. arXiv preprint arXiv:2212.14546 (2022)."},{"key":"e_1_3_2_1_65_1","unstructured":"Youngjae Yu Jongseok Kim and Gunhee Kim. 2018. A joint sequence fusion model for video question answering and retrieval. In ECCV."},{"key":"e_1_3_2_1_66_1","unstructured":"Zhou Yu Jun Yu Yuhao Cui Dacheng Tao and Qi Tian. 2019. Deep modular co-attention networks for visual question answering. In CVPR."},{"key":"e_1_3_2_1_67_1","unstructured":"Heeseung Yun Youngjae Yu Wonsuk Yang Kangil Lee and Gunhee Kim. 2021. Pano-AVQA: Grounded Audio-Visual Question Answering on 360deg Videos. In ICCV."},{"key":"e_1_3_2_1_68_1","volume-title":"Jize Cao, Ali Farhadi, and Yejin Choi.","author":"Zellers Rowan","year":"2021","unstructured":"Rowan Zellers, Ximing Lu, Jack Hessel, Youngjae Yu, Jae Sung Park, Jize Cao, Ali Farhadi, and Yejin Choi. 2021. MERLOT: Multimodal Neural Script Knowledge Models. In NeurIPS."},{"key":"e_1_3_2_1_69_1","first-page":"63","article-title":"Action-centric relation transformer network for video question answering","volume":"32","author":"Zhang Jipeng","year":"2020","unstructured":"Jipeng Zhang, Jie Shao, Rui Cao, Lianli Gao, Xing Xu, and Heng Tao Shen. 2020. Action-centric relation transformer network for video question answering. IEEE TCSVT, Vol. 32, 1 (2020), 63--74.","journal-title":"IEEE TCSVT"},{"key":"e_1_3_2_1_70_1","volume-title":"Video Question Answering: Datasets, Algorithms and Challenges. arXiv preprint arXiv:2203.01225","author":"Zhong Yaoyao","year":"2022","unstructured":"Yaoyao Zhong, Wei Ji, Junbin Xiao, Yicong Li, Weihong Deng, and Tat-Seng Chua. 2022. Video Question Answering: Datasets, Algorithms and Challenges. arXiv preprint arXiv:2203.01225 (2022)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"Peng Zhou Wei Shi Jun Tian Zhenyu Qi Bingchen Li Hongwei Hao and Bo Xu. 2016. Attention-based bidirectional long short-term memory networks for relation classification. In ACL.","DOI":"10.18653\/v1\/P16-2034"},{"key":"e_1_3_2_1_72_1","volume-title":"Actbert: Learning global-local video-text representations. In CVPR.","author":"Zhu Linchao","year":"2020","unstructured":"Linchao Zhu and Yi Yang. 2020. Actbert: Learning global-local video-text representations. In CVPR."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612132","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612132","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:54:30Z","timestamp":1755820470000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612132"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":72,"alternative-id":["10.1145\/3581783.3612132","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612132","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}