{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:30:31Z","timestamp":1770917431182,"version":"3.50.1"},"reference-count":34,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2024,5,1]],"date-time":"2024-05-01T00:00:00Z","timestamp":1714521600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,5,1]],"date-time":"2024-05-01T00:00:00Z","timestamp":1714521600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,1]],"date-time":"2024-05-01T00:00:00Z","timestamp":1714521600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2024,5]]},"DOI":"10.1109\/tcsvt.2023.3318220","type":"journal-article","created":{"date-parts":[[2023,10,2]],"date-time":"2023-10-02T18:09:45Z","timestamp":1696270185000},"page":"4109-4119","source":"Crossref","is-referenced-by-count":18,"title":["Question-Aware Global-Local Video Understanding Network for Audio-Visual Question Answering"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-8431-5471","authenticated-orcid":false,"given":"Zailong","family":"Chen","sequence":"first","affiliation":[{"name":"School of Computing and Information Technology, University of Wollongong, Wollongong, NSW, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0961-0441","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computing and Information Technology, University of Wollongong, Wollongong, NSW, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5397-9115","authenticated-orcid":false,"given":"Peng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Peng","family":"Gao","sequence":"additional","affiliation":[{"name":"Institute of Computer Science, Beijing Normal University&#x2013;Hong Kong Baptist University United International College, Zhuhai, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.2995959"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3051277"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3229081"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3212463"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2023.3264524"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3010650"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1606"},{"key":"ref9","first-page":"1","article-title":"Audio Visual Scene-aware Dialog (AVSD) track for natural language generation in DSTC7","volume-title":"Proc. AAAI Workshop","volume":"2","author":"Alamri"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3078368"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746481"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00204"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548291"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00228"},{"key":"ref16","article-title":"VALOR: Vision-Audio-Language Omni-peRception pretraining model and dataset","author":"Chen","year":"2023","journal-title":"arXiv:2304.08345"},{"key":"ref17","article-title":"Audiovisual SlowFast networks for video recognition","author":"Xiao","year":"2020","journal-title":"arXiv:2001.08740"},{"key":"ref18","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Nagrani"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01939"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1312"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301371"},{"key":"ref25","article-title":"A joint cross-attention model for audio-visual fusion in dimensional emotion recognition","author":"Rajasekar","year":"2022","journal-title":"arXiv:2203.14779"},{"key":"ref26","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv:1807.03748"},{"key":"ref27","first-page":"1","article-title":"Hierarchical question-image co-attention for visual question answering","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"29","author":"Lu"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018658"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00210"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01283"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3048440"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6767"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/76\/10527423\/10268453.pdf?arnumber=10268453","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,5,20]],"date-time":"2024-05-20T17:34:12Z","timestamp":1716226452000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10268453\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5]]},"references-count":34,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2023.3318220","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,5]]}}}