{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T16:09:35Z","timestamp":1782403775527,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["No. 2018AAA0102001"],"award-info":[{"award-number":["No. 2018AAA0102001"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62250008, No. 62102222."],"award-info":[{"award-number":["No. 62250008, No. 62102222."]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548291","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:46Z","timestamp":1665416566000},"page":"3480-3491","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":57,"title":["AVQA: A Dataset for Audio-Visual Question Answering on Videos"],"prefix":"10.1145","author":[{"given":"Pinci","family":"Yang","sequence":"first","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xuguang","family":"Duan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hong","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Runze","family":"Hou","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Cong","family":"Jin","sequence":"additional","affiliation":[{"name":"Communication University of China, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wenwu","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Seeing sounds: visual and auditory interactions in the brain. Current opinion in neurobiology","author":"Bulkin David A","year":"2006","unstructured":"David A Bulkin and Jennifer M Groh . 2006. Seeing sounds: visual and auditory interactions in the brain. Current opinion in neurobiology , Vol. 16 , 4 ( 2006 ), 415--419. David A Bulkin and Jennifer M Groh. 2006. Seeing sounds: visual and auditory interactions in the brain. Current opinion in neurobiology, Vol. 16, 4 (2006), 415--419."},{"key":"e_1_3_2_2_2_1","volume-title":"Proceedings of the 12th Language Resources and Evaluation Conference. European Language Resources Association","author":"Castro Santiago","year":"2020","unstructured":"Santiago Castro , Mahmoud Azab , Jonathan Stroud , Cristina Noujaim , Ruoyao Wang , Jia Deng , and Rada Mihalcea . 2020 . LifeQA: A Real-life Dataset for Video Question Answering . In Proceedings of the 12th Language Resources and Evaluation Conference. European Language Resources Association , Marseille, France, 4352--4358. https:\/\/aclanthology.org\/ 2020.lrec-1.536 Santiago Castro, Mahmoud Azab, Jonathan Stroud, Cristina Noujaim, Ruoyao Wang, Jia Deng, and Rada Mihalcea. 2020. LifeQA: A Real-life Dataset for Video Question Answering. In Proceedings of the 12th Language Resources and Evaluation Conference. European Language Resources Association, Marseille, France, 4352--4358. https:\/\/aclanthology.org\/2020.lrec-1.536"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_2_4_1","volume-title":"Dramaqa: Character-centered video story understanding with hierarchical qa. arXiv preprint arXiv:2005.03356","author":"Choi Seongho","year":"2020","unstructured":"Seongho Choi , Kyoung-Woon On , Yu-Jung Heo , Ahjeong Seo , Youwon Jang , Minsu Lee , and Byoung-Tak Zhang . 2020 . Dramaqa: Character-centered video story understanding with hierarchical qa. arXiv preprint arXiv:2005.03356 (2020). Seongho Choi, Kyoung-Woon On, Yu-Jung Heo, Ahjeong Seo, Youwon Jang, Minsu Lee, and Byoung-Tak Zhang. 2020. Dramaqa: Character-centered video story understanding with hierarchical qa. arXiv preprint arXiv:2005.03356 (2020)."},{"key":"e_1_3_2_2_5_1","volume-title":"Daisy Zhe Wang, and Doo Soon Kim","author":"Colas Anthony","year":"2019","unstructured":"Anthony Colas , Seokhwan Kim , Franck Dernoncourt , Siddhesh Gupte , Daisy Zhe Wang, and Doo Soon Kim . 2019 . TutorialVQA : Question answering dataset for tutorial videos. arXiv preprint arXiv:1912.01046 (2019). Anthony Colas, Seokhwan Kim, Franck Dernoncourt, Siddhesh Gupte, Daisy Zhe Wang, and Doo Soon Kim. 2019. TutorialVQA: Question answering dataset for tutorial videos. arXiv preprint arXiv:1912.01046 (2019)."},{"key":"e_1_3_2_2_6_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Duan Xuguang","year":"2018","unstructured":"Xuguang Duan , Wenbing Huang , Chuang Gan , Jingdong Wang , Wenwu Zhu , and Junzhou Huang . 2018 . Weakly supervised dense event captioning in videos . Advances in Neural Information Processing Systems , Vol. 31 (2018). Xuguang Duan, Wenbing Huang, Chuang Gan, Jingdong Wang, Wenwu Zhu, and Junzhou Huang. 2018. Weakly supervised dense event captioning in videos. Advances in Neural Information Processing Systems, Vol. 31 (2018)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00210"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00688"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6713"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01113"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01113"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3051756"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6737"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1167\/19.11.1"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.149"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6767"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3076556"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00853"},{"key":"e_1_3_2_2_21_1","volume-title":"Deepstory: Video story qa by deep embedded memory networks. arXiv preprint arXiv:1707.00836","author":"Kim Kyung-Min","year":"2017","unstructured":"Kyung-Min Kim , Min-Oh Heo , Seong-Ho Choi , and Byoung-Tak Zhang . 2017 . Deepstory: Video story qa by deep embedded memory networks. arXiv preprint arXiv:1707.00836 (2017). Kyung-Min Kim, Min-Oh Heo, Seong-Ho Choi, and Byoung-Tak Zhang. 2017. Deepstory: Video story qa by deep embedded memory networks. arXiv preprint arXiv:1707.00836 (2017)."},{"key":"e_1_3_2_2_22_1","volume-title":"Plumbley","author":"Kong Qiuqiang","year":"2019","unstructured":"Qiuqiang Kong , Yin Cao , Turab Iqbal , Yuxuan Wang , Wenwu Wang , and Mark D . Plumbley . 2019 . PANNs: Large- Scale Pretrained Audio Neural Networks for Audio Pattern Recognition . (2019). http:\/\/arxiv.org\/abs\/1912.10211 Qiuqiang Kong, Yin Cao, Turab Iqbal, Yuxuan Wang, Wenwu Wang, and Mark D. Plumbley. 2019. PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition. (2019). http:\/\/arxiv.org\/abs\/1912.10211"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"e_1_3_2_2_24_1","volume-title":"TVQA: Localized, Compositional Video Question Answering. In EMNLP.","author":"Lei Jie","year":"2018","unstructured":"Jie Lei , Licheng Yu , Mohit Bansal , and Tamara L Berg . 2018 . TVQA: Localized, Compositional Video Question Answering. In EMNLP. Jie Lei, Licheng Yu, Mohit Bansal, and Tamara L Berg. 2018. TVQA: Localized, Compositional Video Question Answering. In EMNLP."},{"key":"e_1_3_2_2_25_1","volume-title":"TVQA: Spatio-Temporal Grounding for Video Question Answering. In Tech Report, arXiv.","author":"Lei Jie","year":"2019","unstructured":"Jie Lei , Licheng Yu , Tamara L Berg , and Mohit Bansal . 2019 . TVQA: Spatio-Temporal Grounding for Video Question Answering. In Tech Report, arXiv. Jie Lei, Licheng Yu, Tamara L Berg, and Mohit Bansal. 2019. TVQA: Spatio-Temporal Grounding for Video Question Answering. In Tech Report, arXiv."},{"key":"e_1_3_2_2_26_1","volume-title":"Learning to Answer Questions in Dynamic Audio-Visual Scenarios. arXiv preprint arXiv:2203.14072","author":"Li Guangyao","year":"2022","unstructured":"Guangyao Li , Yake Wei , Yapeng Tian , Chenliang Xu , Ji-Rong Wen , and Di Hu. 2022c. Learning to Answer Questions in Dynamic Audio-Visual Scenarios. arXiv preprint arXiv:2203.14072 ( 2022 ). Guangyao Li, Yake Wei, Yapeng Tian, Chenliang Xu, Ji-Rong Wen, and Di Hu. 2022c. Learning to Answer Questions in Dynamic Audio-Visual Scenarios. arXiv preprint arXiv:2203.14072 (2022)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02059"},{"key":"e_1_3_2_2_28_1","volume-title":"Hero: Hierarchical encoder for video language omni-representation pre-training. arXiv preprint arXiv:2005.00200","author":"Li Linjie","year":"2020","unstructured":"Linjie Li , Yen-Chun Chen , Yu Cheng , Zhe Gan , Licheng Yu , and Jingjing Liu . 2020 . Hero: Hierarchical encoder for video language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020). Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, and Jingjing Liu. 2020. Hero: Hierarchical encoder for video language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350971"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018658"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00294"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO54536.2021.9616257"},{"key":"e_1_3_2_2_33_1","volume-title":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing. 1532--1543","author":"Pennington Jeffrey","unstructured":"Jeffrey Pennington , Richard Socher , and Christopher D. Manning . 2014. Glove: Global Vectors for Word Representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing. 1532--1543 . Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. Glove: Global Vectors for Word Representation. In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing. 1532--1543."},{"key":"e_1_3_2_2_34_1","volume-title":"Benefits of multisensory learning. Trends in cognitive sciences","author":"Shams Ladan","year":"2008","unstructured":"Ladan Shams and Aaron R Seitz . 2008. Benefits of multisensory learning. Trends in cognitive sciences , Vol. 12 , 11 ( 2008 ), 411--417. Ladan Shams and Aaron R Seitz. 2008. Benefits of multisensory learning. Trends in cognitive sciences, Vol. 12, 11 (2008), 411--417."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413909"},{"key":"e_1_3_2_2_36_1","unstructured":"SVQA-founder. 2018. Synthetic Video Question Answering. https:\/\/github.com\/SVQA-founder\/SVQA.  SVQA-founder. 2018. Synthetic Video Question Answering. https:\/\/github.com\/SVQA-founder\/SVQA."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.501"},{"key":"e_1_3_2_2_38_1","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2).","author":"Wu Bo","year":"2021","unstructured":"Bo Wu , Shoubin Yu , Zhenfang Chen , Joshua B Tenenbaum , and Chuang Gan . 2021 . STAR: A benchmark for situated reasoning in real-world videos . In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2). Bo Wu, Shoubin Yu, Zhenfang Chen, Joshua B Tenenbaum, and Chuang Gan. 2021. STAR: A benchmark for situated reasoning in real-world videos. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123427"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2746267"},{"key":"e_1_3_2_2_43_1","volume-title":"Clevrer: Collision events for video representation and reasoning. arXiv preprint arXiv:1910.01442","author":"Yi Kexin","year":"2019","unstructured":"Kexin Yi , Chuang Gan , Yunzhu Li , Pushmeet Kohli , Jiajun Wu , Antonio Torralba , and Joshua B Tenenbaum . 2019 . Clevrer: Collision events for video representation and reasoning. arXiv preprint arXiv:1910.01442 (2019). Kexin Yi, Chuang Gan, Yunzhu Li, Pushmeet Kohli, Jiajun Wu, Antonio Torralba, and Joshua B Tenenbaum. 2019. Clevrer: Collision events for video representation and reasoning. arXiv preprint arXiv:1910.01442 (2019)."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00204"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00901"},{"key":"e_1_3_2_2_47_1","volume-title":"Action-Centric Relation Transformer Network for Video Question Answering","author":"Zhang Jipeng","year":"2020","unstructured":"Jipeng Zhang , Jie Shao , Rui Cao , Lianli Gao , Xing Xu , and Heng Tao Shen . 2020. Action-Centric Relation Transformer Network for Video Question Answering . IEEE Transactions on Circuits and Systems for Video Technology ( 2020 ). Jipeng Zhang, Jie Shao, Rui Cao, Lianli Gao, Xing Xu, and Heng Tao Shen. 2020. Action-Centric Relation Transformer Network for Video Question Answering. IEEE Transactions on Circuits and Systems for Video Technology (2020)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-017-1033-7"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366710"}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","location":"Lisboa Portugal","acronym":"MM '22","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548291","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548291","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:43Z","timestamp":1750186843000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548291"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":49,"alternative-id":["10.1145\/3503161.3548291","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548291","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}