{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,18]],"date-time":"2026-04-18T16:22:31Z","timestamp":1776529351912,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613909","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"5195-5203","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Language-Guided Visual Aggregation Network for Video Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0382-2715","authenticated-orcid":false,"given":"Xiao","family":"Liang","sequence":"first","affiliation":[{"name":"Xidian University, Xian, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8027-4287","authenticated-orcid":false,"given":"Di","family":"Wang","sequence":"additional","affiliation":[{"name":"Xidian University, Xian, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3410-9560","authenticated-orcid":false,"given":"Quan","family":"Wang","sequence":"additional","affiliation":[{"name":"Xidian University, Xian, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6913-8604","authenticated-orcid":false,"given":"Bo","family":"Wan","sequence":"additional","affiliation":[{"name":"Xidian University, Xian, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0103-489X","authenticated-orcid":false,"given":"Lingling","family":"An","sequence":"additional","affiliation":[{"name":"Xidian University, Xian, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0555-3574","authenticated-orcid":false,"given":"Lihuo","family":"He","sequence":"additional","affiliation":[{"name":"Xidian University, Xian, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence.","author":"Cherian Anoop","year":"2022","unstructured":"Anoop Cherian, Chiori Hori, Tim K. Marks, and Jonathan Le Roux. 2022. (2.51)d spatio-temporal scene graphs for video question answering. In Proceedings of the AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_4_1","unstructured":"Jacob Devlin Ming-Wei Chang Kenton Lee and Kristina Toutanova. 2019. Bert: pre-training of deep bidirectional transformers for language understanding. ArXiv abs\/1810.04805."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00210"},{"key":"e_1_3_2_1_6_1","unstructured":"Han Fang Pengfei Xiong Luhui Xu and Yu Chen. 2021. Clip2video: mastering video-text retrieval via image clip. ArXiv abs\/2106.11097."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00688"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 770--778","author":"He Kaiming","year":"2015","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2015. Deep residual learning for image recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 770--778."},{"key":"e_1_3_2_1_9_1","unstructured":"Jingjia Huang Yinan Li Jiashi Feng Xiaoshuai Sun and Rongrong Ji. 2022. Clover: towards a unified video-language alignment and fusion model. ArXiv abs\/2207.07885."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.149"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6767"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_14_1","volume-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 9969--9978","author":"Le Thao Minh","unstructured":"Thao Minh Le, Vuong Le, Svetha Venkatesh, and T. Tran. 2020. Hierarchical conditional relation networks for video question answering. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 9969--9978."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_16_1","volume-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 4943--4953","author":"Li Dongxu","unstructured":"Dongxu Li, Junnan Li, Hongdong Li, Juan Carlos Niebles, and Steven C. H. Hoi. 2021. Align and prompt: video-and-language pre-training with entity prompts. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 4943--4953."},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Machine Learning.","author":"Li Junnan","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C. H. Hoi. 2022. Blip: boot-strapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_18_1","volume-title":"Shafiq R. Joty, Caiming Xiong, and Steven C. H. Hoi.","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath R. Selvaraju, Akhilesh Deepak Gotmare, Shafiq R. Joty, Caiming Xiong, and Steven C. H. Hoi. 2021. Align before fuse: vision and language representation learning with momentum distillation. ArXiv, abs\/2107.07651."},{"key":"e_1_3_2_1_19_1","volume-title":"Conference on Empirical Methods in Natural Language Processing.","author":"Li Linjie","year":"2020","unstructured":"Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, and Jingjing Liu. 2020. Hero: hierarchical encoder for videolanguage omni-representation pretraining. In Conference on Empirical Methods in Natural Language Processing."},{"key":"e_1_3_2_1_20_1","volume-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 10955--10965","author":"Harold Liunian","unstructured":"Liunian Harold Li et al. 2021. Grounded language-image pre-training. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 10955--10965."},{"key":"e_1_3_2_1_21_1","unstructured":"Yi Li Hualiang Wang Yiqun Duan and X. Li. 2023. Clip surgery for better explainability with enhancement in open-vocabulary tasks. In."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00294"},{"key":"e_1_3_2_1_23_1","volume-title":"Heng Ji, and Shih-Fu Chang.","author":"Lin Xudong","year":"2022","unstructured":"Xudong Lin, Simran Tiwari, Shiyuan Huang, Manling Li, Mike Zheng Shou, Heng Ji, and Shih-Fu Chang. 2022. Towards fast adaptation of pretrained contrastive models for multi-channel video-language retrieval. ArXiv, abs\/2206.02082."},{"key":"e_1_3_2_1_24_1","unstructured":"Yinhan Liu et al. 2019. Roberta: a robustly optimized bert pretraining approach. ArXiv abs\/1907.11692."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475193"},{"key":"e_1_3_2_1_27_1","volume-title":"Yu Shi Yuan Gao, and Xiang-Dong Zhou","author":"Peng Min","year":"2022","unstructured":"Min Peng, Chongyang Wang, Yu Shi Yuan Gao, and Xiang-Dong Zhou. 2022. Multilevel hierarchical network with multiscale sampling for video question answering. ArXiv, abs\/2205.04061."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_5"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548061"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Alec","unstructured":"Alec Radford et al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"e_1_3_2_1_34_1","volume-title":"2015 IEEE International Conference on Computer Vision (ICCV), 4489--4497","author":"Tran Du","year":"2014","unstructured":"Du Tran, Lubomir D. Bourdev, Rob Fergus, Lorenzo Torresani, and Manohar Paluri. 2014. Learning spatiotemporal features with 3d convolutional networks. 2015 IEEE International Conference on Computer Vision (ICCV), 4489--4497."},{"key":"e_1_3_2_1_35_1","unstructured":"Alex Wang et al. 2022. All in one: exploring unified video-language pre-training. ArXiv abs\/2203.07303."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475620"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence.","author":"Xiao Junbin","year":"2021","unstructured":"Junbin Xiao, Angela Yao, Zhiyuan Liu, Yicong Li, Wei Ji, and Tat-Seng Chua. 2021. Video as conditional graph hierarchy for multi-granular question answering. In Proceedings of the AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_3"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning.","author":"Ruibin","unstructured":"Ruibin Xiong et al. 2020. On layer normalization in the transformer architecture. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123427"},{"key":"e_1_3_2_1_42_1","volume-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 5026--5035","author":"Xue Hongwei","year":"2021","unstructured":"Hongwei Xue, Tiankai Hang, Yanhong Zeng, Yuchong Sun, Bei Liu, Huan Yang, Jianlong Fu, and Baining Guo. 2021. Advancing high-resolution video-language representation with large-scale video transcriptions. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 5026--5035."},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the International Conference on Computer Vision (ICCV), 1666--1677","author":"Yang Antoine","year":"2020","unstructured":"Antoine Yang, Antoine Miech, Josef Sivic, Ivan Laptev, and Cordelia Schmid. 2020. Just ask: learning to answer questions from millions of narrated videos. In Proceedings of the International Conference on Computer Vision (ICCV), 1666--1677."},{"key":"e_1_3_2_1_44_1","unstructured":"Antoine Yang Antoine Miech Josef Sivic Ivan Laptev and Cordelia Schmid. 2022. Zero-shot video question answering via frozen bidirectional language models. ArXiv abs\/2206.08155."},{"key":"e_1_3_2_1_45_1","unstructured":"Jiahui Yu Zirui Wang Vijay Vasudevan Legg Yeung Mojtaba Seyedhosseini and Yonghui Wu. 2022. Coca: contrastive captioners are image-text foundation models. ArXiv abs\/2205.01917."},{"key":"e_1_3_2_1_46_1","unstructured":"Zhou Yu D. Xu Jun Yu Ting Yu Zhou Zhao Yueting Zhuang and Dacheng Tao. 2019. Activitynet-qa: a dataset for understanding complex web videos via question answering. ArXiv abs\/1906.02467."},{"key":"e_1_3_2_1_47_1","volume-title":"Jize Cao, Ali Farhadi, and Yejin Choi.","author":"Zellers Rowan","year":"2021","unstructured":"Rowan Zellers, Ximing Lu, Jack Hessel, Youngjae Yu, Jae Sung Park, Jize Cao, Ali Farhadi, and Yejin Choi. 2021. Merlot: multimodal neural script knowledge models. In Advances in Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_2_1_48_1","volume-title":"Jae Sung Park, and Yejin Choi","author":"Zellers Rowan","year":"2021","unstructured":"Rowan Zellers, Ximing Lu, Jack Hessel, Youngjae Yu, Jae Sung Park, and Yejin Choi. 2021. Multimodal neural script knowledge models. In."},{"key":"e_1_3_2_1_49_1","unstructured":"Andy Zeng et al. 2022. Socratic models: composing zero-shot multimodal reasoning with language. ArXiv abs\/2204.00598."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613909","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613909","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:10:14Z","timestamp":1755821414000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613909"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":49,"alternative-id":["10.1145\/3581783.3613909","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613909","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}