{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T16:52:07Z","timestamp":1781542327861,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810845","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1006-1014","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Unified Object-Centric Spatio-Temporal Graph Reasoning Framework for Audio-Visual Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8044-5912","authenticated-orcid":false,"given":"Feifei","family":"Xu","sequence":"first","affiliation":[{"name":"Shanghai University of Electric Power, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6928-4141","authenticated-orcid":false,"given":"Wenjing","family":"Zhu","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5956-9143","authenticated-orcid":false,"given":"Dongyang","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1275-3454","authenticated-orcid":false,"given":"Puzhe","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9994-1382","authenticated-orcid":false,"given":"Luobin","family":"Huang","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8740-2374","authenticated-orcid":false,"given":"Yu","family":"Xie","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9259-6345","authenticated-orcid":false,"given":"Zirui","family":"Xu","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Kenza Amara Lukas Klein Carsten L\u00fcth Paul J\u00e4ger Hendrik Strobelt and Mennatallah El-Assady. 2024. Why context matters in VQA and Reasoning: Semantic interventions for VLM input modalities. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.01690 (2024)."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Dongsheng Chen Chaofan Tao Lu Hou Lifeng Shang Xin Jiang and Qun Liu. 2022. Litevl: Efficient video-language learning with enhanced spatial-temporal modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.11929 (2022).","DOI":"10.18653\/v1\/2022.emnlp-main.545"},{"key":"e_1_3_3_1_4_2","unstructured":"Sihan Chen Xingjian He Longteng Guo Xinxin Zhu Weining Wang Jinhui Tang and Jing Liu. 2023. Valor: Vision-audio-language omni-perception pretraining model and dataset. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.08345 (2023)."},{"key":"e_1_3_3_1_5_2","first-page":"4640","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision","author":"Oliveira\u00a0Souza Bruno\u00a0Cesar de","year":"2023","unstructured":"Bruno\u00a0Cesar de Oliveira\u00a0Souza, Marius Aasan, Helio Pedrini, and Adin\u00a0Ramirez Rivera. 2023. Selfgraphvqa: a self-supervised graph neural network for scene-based question answering. In Proceedings of the IEEE\/CVF international conference on computer vision. 4640\u20134645."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Haoyi Duan Yan Xia Zhou Mingze Li Tang Jieming Zhu and Zhou Zhao. 2023. Cross-modal prompts: Adapting large pre-trained models for audio-visual downstream tasks. Advances in Neural Information Processing Systems 36 (2023) 56075\u201356094.","DOI":"10.52202\/075280-2445"},{"key":"e_1_3_3_1_7_2","unstructured":"Hao Fei Shengqiong Wu Wei Ji Hanwang Zhang Meishan Zhang Mong-Li Lee and Wynne Hsu. 2024. Video-of-thought: Step-by-step video reasoning from perception to cognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.03230 (2024)."},{"key":"e_1_3_3_1_8_2","unstructured":"Zijian Fu Changsheng Lv Mengshi Qi and Huadong Ma. 2025. Multi-Modal Scene Graph with Kolmogorov-Arnold Experts for Audio-Visual Question Answering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.23304 (2025)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6737"},{"key":"e_1_3_3_1_13_2","first-page":"1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshop (CVPRW)","author":"Huang Ziru","year":"2024","unstructured":"Ziru Huang, Jia Li, Wenjie Zhao, Yunhui Guo, and Yapeng Tian. 2024. AV-Mamba: Cross-Modality Selective State Space Models for Audio-Visual Question Answering. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshop (CVPRW). 1\u20134."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Yuanyuan Jiang and Jianqin Yin. 2025. CLIP-Powered TASS: Target-Aware Single-Stream Network for Audio-Visual Question Answering. International Journal of Computer Vision 133 5 (2025) 2581\u20132598.","DOI":"10.1007\/s11263-024-02289-z"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01277"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26527"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680803"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612293"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"e_1_3_3_1_20_2","unstructured":"Guangyao Li Yixin Xu and Di Hu. 2023. Multi-scale attention for audio question answering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.17993 (2023)."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28116"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32538"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00228"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00442"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Jie Ma Min Hu Pinghui Wang Wangchun Sun Lingyun Song Hongbin Pei Jun Liu and Youtian Du. 2024. Look listen and answer: Overcoming biases for audio-visual question answering. Advances in Neural Information Processing Systems 37 (2024) 9507\u20139531.","DOI":"10.52202\/079017-0302"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00709"},{"key":"e_1_3_3_1_27_2","first-page":"42","volume-title":"European Conference on Computer Vision","author":"Park Kyu\u00a0Ri","year":"2024","unstructured":"Kyu\u00a0Ri Park, Hong\u00a0Joo Lee, and Jung\u00a0Uk Kim. 2024. Learning Trimodal Relation for Audio-Visual Question Answering with Missing Modality. In European Conference on Computer Vision. Springer, 42\u201359."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446292"},{"key":"e_1_3_3_1_29_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_30_2","unstructured":"Karan Reddy and Mayukha Pal. 2025. Contextual Graph Transformer: A Small Language Model for Enhanced Engineering Document Information Extraction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.02532 (2025)."},{"key":"e_1_3_3_1_31_2","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01283"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01020"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01020"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW69036.2025.00447"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Yuchong Sun Hongwei Xue Ruihua Song Bei Liu Huan Yang and Jianlong Fu. 2022. Long-form video-language pre-training with multimodal temporal contrastive learning. Advances in neural information processing systems 35 (2022) 38032\u201338045.","DOI":"10.52202\/068431-2756"},{"key":"e_1_3_3_1_37_2","unstructured":"Yunlong Tang Daiki Shimada Jing Bi and Chenliang Xu. 2024. Avicuna: Audio-visual llm with interleaver and context-boundary alignment for temporal referential dialogue. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.16276 2 (2024)."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00190"},{"key":"e_1_3_3_1_39_2","unstructured":"Yanan Wang Shuichiro Haruta Donghuo Zeng Julio Vizcarra and Mori Kurokawa. 2024. Multi-object event graph representation learning for video question answering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.07747 (2024)."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01973"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548291"},{"key":"e_1_3_3_1_42_2","unstructured":"Siwei Yang Bingchen Zhao and Cihang Xie. 2024. AQA-Bench: An Interactive Benchmark for Evaluating LLMs\u2019 Sequential Reasoning Ability. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.09404 (2024)."},{"key":"e_1_3_3_1_43_2","unstructured":"Zhe Yang Wenrui Li and Guanghui Cheng. 2024. Shmamba: Structured hyperbolic state space model for audio-visual question answering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.09833 (2024)."},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00204"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Sheng Zhou Dan Guo Jia Li Xun Yang and Meng Wang. 2023. Exploring sparse spatial relation in graph inference for text-based VQA. IEEE Transactions on Image Processing 32 (2023) 5060\u20135074.","DOI":"10.1109\/TIP.2023.3310332"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:54:06Z","timestamp":1781538846000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810845"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":44,"alternative-id":["10.1145\/3805622.3810845","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810845","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}