{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:43:24Z","timestamp":1777657404299,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"University Synergy Innovation Program of Anhui Province","award":["GXXT-2022-040"],"award-info":[{"award-number":["GXXT-2022-040"]}]},{"name":"National Natural Science Foundation of China","award":["9227010114"],"award-info":[{"award-number":["9227010114"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612577","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"3172-3180","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":20,"title":["Redundancy-aware Transformer for Video Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5659-793X","authenticated-orcid":false,"given":"Yicong","family":"Li","sequence":"first","affiliation":[{"name":"National University of Singapore, singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0201-1638","authenticated-orcid":false,"given":"Xun","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1367-711X","authenticated-orcid":false,"given":"An","family":"Zhang","sequence":"additional","affiliation":[{"name":"National University of Singapore, singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6209-1664","authenticated-orcid":false,"given":"Chun","family":"Feng","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6148-6329","authenticated-orcid":false,"given":"Xiang","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6097-7807","authenticated-orcid":false,"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Shyamal Buch Crist\u00f3bal Eyzaguirre Adrien Gaidon Jiajun Wu Li Fei-Fei and Juan Carlos Niebles. 2022. Revisiting the \"Video\" in Video-Language Understanding. In CVPR. 2917--2927.","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"e_1_3_2_1_2_1","unstructured":"R\u00e9mi Cad\u00e8ne Corentin Dancette Hedi Ben-younes Matthieu Cord and Devi Parikh. 2019. RUBi: Reducing Unimodal Biases for Visual Question Answering. In NeurIPS. 839--850."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Long Chen Xin Yan Jun Xiao Hanwang Zhang Shiliang Pu and Yueting Zhuang. 2020. Counterfactual Samples Synthesizing for Robust Visual Question Answering. In CVPR. 10797--10806.","DOI":"10.1109\/CVPR42600.2020.01081"},{"key":"e_1_3_2_1_4_1","volume-title":"Vuong Le, and Truyen Tran.","author":"Dang Long Hoang","year":"2021","unstructured":"Long Hoang Dang, Thao Minh Le, Vuong Le, and Truyen Tran. 2021. Hierarchical Object-oriented Spatio-Temporal Reasoning for Video Question Answering. In IJCAI. 636--642."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547976"},{"key":"e_1_3_2_1_6_1","first-page":"4065","article-title":"Dual encoding for video retrieval by text","volume":"44","author":"Dong Jianfeng","year":"2021","unstructured":"Jianfeng Dong, Xirong Li, Chaoxi Xu, Xun Yang, Gang Yang, Xun Wang, and Meng Wang. 2021. Dual encoding for video retrieval by text. IEEE Transactions on Pattern Analysis and Machine Intelligence 44, 8 (2021), 4065--4080.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_7_1","volume-title":"Sai Srinivas Kancheti, and Vineeth N. Balasubramanian","author":"Dua Radhika","year":"2021","unstructured":"Radhika Dua, Sai Srinivas Kancheti, and Vineeth N. Balasubramanian. 2021. Beyond VQA: Generating Multi-Word Answers and Rationales to Visual Questions. In CVPR. 1623--1632."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Michael Figurnov Maxwell D. Collins Yukun Zhu Li Zhang Jonathan Huang Dmitry P. Vetrov and Ruslan Salakhutdinov. 2017. Spatially Adaptive Computation Time for Residual Networks. In CVPR. 1790--1799.","DOI":"10.1109\/CVPR.2017.194"},{"key":"e_1_3_2_1_9_1","unstructured":"Pengcheng He Xiaodong Liu Jianfeng Gao and Weizhu Chen. 2021. Deberta: decoding-Enhanced Bert with Disentangled Attention. In ICLR."},{"key":"e_1_3_2_1_10_1","volume-title":"Tgif-qa: Toward spatio-temporal reasoning in visual question answering. In CVPR. 2758--2766.","author":"Jang Yunseok","year":"2017","unstructured":"Yunseok Jang, Yale Song, Youngjae Yu, Youngjin Kim, and Gunhee Kim. 2017. Tgif-qa: Toward spatio-temporal reasoning in visual question answering. In CVPR. 2758--2766."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479232"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Wei Ji Renjie Liang Lizi Liao Hao Fei and Fuli Feng. 2023. Partial Annotation-based Video Moment Retrieval via Iterative Learning. In ACM MM.","DOI":"10.1145\/3581783.3612088"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Wei Ji Renjie Liang Zhedong Zheng Wenqiao Zhang Shengyu Zhang Juncheng Li Mengze Li and Tat-seng Chua. 2023. Are binary annotations sufficient? video moment retrieval via hierarchical uncertainty-based active learning. In CVPR. 23013--23022.","DOI":"10.1109\/CVPR52729.2023.02204"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Pin Jiang and Yahong Han. 2020. Reasoning with Heterogeneous Graph Alignment for Video Question Answering. In AAAI. 11109--11116.","DOI":"10.1609\/aaai.v34i07.6767"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Thao Minh Le Vuong Le Svetha Venkatesh and Truyen Tran. 2020. Hierarchical Conditional Relation Networks for Video Question Answering. In CVPR. 9969--9978.","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Jiangtong Li Li Niu and Liqing Zhang. 2022. From Representation to Reasoning: Towards both Evidence and Commonsense Reasoning for Video Question-Answering. In CVPR. 21241--21250.","DOI":"10.1109\/CVPR52688.2022.02059"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3587251"},{"key":"e_1_3_2_1_18_1","volume-title":"Equivariant and Invariant Grounding for Video Question Answering. CoRR abs\/2207.12783","author":"Li Yicong","year":"2022","unstructured":"Yicong Li, Xiang Wang, Junbin Xiao, and Tat-Seng Chua. 2022. Equivariant and Invariant Grounding for Video Question Answering. CoRR abs\/2207.12783 (2022)."},{"key":"e_1_3_2_1_19_1","unstructured":"Yicong Li Xiang Wang Junbin Xiao Wei Ji and Tat-Seng Chua. 2022. Invariant Grounding for Video Question Answering. In CVPR. 2928--2937."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Yicong Li Xun Yang Xindi Shang and Tat-Seng Chua. 2021. Interventional Video Relation Detection. In ACM MM. 4091--4099.","DOI":"10.1145\/3474085.3475540"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Yicong Li Xun Yang Xindi Shang and Tat-Seng Chua. 2021. Interventional video relation detection. In ACM MM. 4091--4099.","DOI":"10.1145\/3474085.3475540"},{"key":"e_1_3_2_1_22_1","volume-title":"Chung-Ching Lin, Alex J. Andonian, Yue Meng, Kate Saenko, Aude Oliva, and Rog\u00e9rio Feris.","author":"Pan Bowen","year":"2021","unstructured":"Bowen Pan, Rameswar Panda, Camilo Luciano Fosco, Chung-Ching Lin, Alex J. Andonian, Yue Meng, Kate Saenko, Aude Oliva, and Rog\u00e9rio Feris. 2021. VA-RED2: Video Adaptive Redundancy Reduction. In ICLR."},{"key":"e_1_3_2_1_23_1","volume-title":"Zeynep Akata, Anna Rohrbach","author":"Park Dong Huk","year":"2018","unstructured":"Dong Huk Park, Lisa Anne Hendricks, Zeynep Akata, Anna Rohrbach, Bernt Schiele, Trevor Darrell, and Marcus Rohrbach. 2018. Multimodal Explanations: Justifying Decisions and Pointing to the Evidence. In CVPR. 8779--8788."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Jungin Park Jiyoung Lee and Kwanghoon Sohn. 2021. Bridge To Answer: Structure-Aware Graph Interaction Network for Video Question Answering. In CVPR. 15526--15535.","DOI":"10.1109\/CVPR46437.2021.01527"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Liang Peng Shuangji Yang Yi Bin and Guoqing Wang. 2021. Progressive Graph Attention Network for Video Question Answering. In ACM MM.","DOI":"10.1145\/3474085.3475193"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Min Peng Chongyang Wang Yuan Gao Yu Shi and Xiang-Dong Zhou. 2022. Multilevel Hierarchical Network with Multiscale Sampling for Video Question Answering. In IJCAI. 1276--1282.","DOI":"10.24963\/ijcai.2022\/178"},{"key":"e_1_3_2_1_27_1","unstructured":"Yongming Rao Wenliang Zhao Benlin Liu Jiwen Lu Jie Zhou and Cho-Jui Hsieh. 2021. DynamicViT: Efficient Vision Transformers with Dynamic Token Sparsification. In NeurIPS Marc'Aurelio Ranzato Alina Beygelzimer Yann N. Dauphin Percy Liang and Jennifer Wortman Vaughan (Eds.). 13937--13949."},{"key":"e_1_3_2_1_28_1","unstructured":"Shaoqing Ren Kaiming He Ross B. Girshick and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In NeurIPS. 91--99."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Marco T\u00falio Ribeiro Sameer Singh and Carlos Guestrin. 2016. \"Why Should I Trust You?\": Explaining the Predictions of Any Classifier. In KDD Balaji Krishnapuram Mohak Shah Alexander J. Smola Charu C. Aggarwal Dou Shen and Rajeev Rastogi (Eds.). 1135--1144.","DOI":"10.1145\/2939672.2939778"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Andrew Slavin Ross Michael C. Hughes and Finale Doshi-Velez. 2017. Right for the Right Reasons: Training Differentiable Models by Constraining their Explanations. In IJCAI. 2662--2670.","DOI":"10.24963\/ijcai.2017\/371"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3323873.3325056"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Xindi Shang Yicong Li Junbin Xiao Wei Ji and Tat-Seng Chua. 2021. Video Visual Relation Detection via Iterative Inference. In ACM MM. 3654--3663.","DOI":"10.1145\/3474085.3475263"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475218"},{"key":"e_1_3_2_1_34_1","volume-title":"Deep Learning and the Information Bottleneck Principle. CoRR abs\/1503.02406","author":"Tishby Naftali","year":"2015","unstructured":"Naftali Tishby and Noga Zaslavsky. 2015. Deep Learning and the Information Bottleneck Principle. CoRR abs\/1503.02406 (2015)."},{"key":"e_1_3_2_1_35_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NeurIPS. 5998--6008."},{"key":"e_1_3_2_1_36_1","unstructured":"Junke Wang Xitong Yang Hengduo Li Li Liu Zuxuan Wu and Yu-Gang Jiang."},{"key":"e_1_3_2_1_37_1","unstructured":". Efficient Video Transformers with Spatial-Temporal Token Selection. In ECCV."},{"key":"e_1_3_2_1_38_1","unstructured":"Yingxin Wu Xiang Wang An Zhang Xiangnan He and Tat-Seng Chua. 2022. Discovering Invariant Rationales for Graph Neural Networks. In ICLR."},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings, Part VI 16","author":"Xiao Junbin","year":"2020","unstructured":"Junbin Xiao, Xindi Shang, Xun Yang, Sheng Tang, and Tat-Seng Chua. 2020. Visual relation grounding in videos. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part VI 16. Springer, 447--464."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Junbin Xiao Xindi Shang Angela Yao and Tat-Seng Chua. 2021. NExT-QA: Next Phase of Question-Answering to Explaining Temporal Actions. In CVPR. 9777--9786.","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Junbin Xiao Angela Yao Zhiyuan Liu Yicong Li Wei Ji and Tat-Seng Chua. 2022. Video as Conditional Graph Hierarchy for Multi-Granular Question Answering. In AAAI. 2804--2812.","DOI":"10.1609\/aaai.v36i3.20184"},{"key":"e_1_3_2_1_42_1","volume-title":"Video Graph Transformer for Video Question Answering","author":"Xiao Junbin","unstructured":"Junbin Xiao, Pan Zhou, Tat-Seng Chua, and Shuicheng Yan. 2022. Video Graph Transformer for Video Question Answering. In ECCV. Springer, 39--58."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Dejing Xu Zhou Zhao Jun Xiao Fei Wu Hanwang Zhang Xiangnan He and Yueting Zhuang. 2017. Video Question Answering via Gradually Refined Attention over Appearance and Motion. In ACM MM. 1645--1653.","DOI":"10.1145\/3123266.3123427"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413610"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3140611"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Hao Zhang Lechao Cheng Yanbin Hao and Chong-wah Ngo. 2022. Long-term leap attention short-term periodic shift for video classification. In ACM MM. 5773--5782.","DOI":"10.1145\/3503161.3547908"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Quanshi Zhang Yu Yang Haotian Ma and Ying Nian Wu. 2019. Interpreting CNNs via Decision Trees. In CVPR. 6261--6270.","DOI":"10.1109\/CVPR.2019.00642"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Yaoyao Zhong. 2022. Video Question Answering: Datasets Algorithms and Challenges. EMNLP (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.432"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612577","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612577","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:58:54Z","timestamp":1755820734000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612577"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":49,"alternative-id":["10.1145\/3581783.3612577","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612577","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}