{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T16:33:36Z","timestamp":1781109216398,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62441225, 61972192, 62172208, 61906085"],"award-info":[{"award-number":["62441225, 61972192, 62172208, 61906085"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["No.14380001"],"award-info":[{"award-number":["No.14380001"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3761997","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"13784-13790","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Hierarchical Vision-Language Reasoning for Multimodal Multiple-Choice Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6056-3515","authenticated-orcid":false,"given":"Ao","family":"Zhou","sequence":"first","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8206-8889","authenticated-orcid":false,"given":"Zebo","family":"Gu","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications, Chongqing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6606-1732","authenticated-orcid":false,"given":"Tenghao","family":"Sun","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications, Chongqing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0494-2473","authenticated-orcid":false,"given":"Jiawen","family":"Chen","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications, Chongqing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1679-0174","authenticated-orcid":false,"given":"Mingsheng","family":"Tu","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications, Chongqing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8486-2614","authenticated-orcid":false,"given":"Zifeng","family":"Cheng","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9497-6244","authenticated-orcid":false,"given":"Yafeng","family":"Yin","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5243-4992","authenticated-orcid":false,"given":"Zhiwei","family":"Jiang","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1112-790X","authenticated-orcid":false,"given":"Qing","family":"Gu","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"key":"e_1_3_2_1_3_1","first-page":"2425","article-title":"Vqa: Visual question answering","author":"Antol Stanislaw","year":"2015","unstructured":"Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C Lawrence Zitnick, and Devi Parikh. 2015. Vqa: Visual question answering. In ICCV. 2425-2433.","journal-title":"ICCV."},{"key":"e_1_3_2_1_4_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_5_1","first-page":"2206","article-title":"Improving language models by retrieving from trillions of tokens","author":"Borgeaud Sebastian","year":"2022","unstructured":"Sebastian Borgeaud, Arthur Mensch, Jordan Hoffmann, Trevor Cai, Eliza Rutherford, Katie Millican, George Bm Van Den Driessche, Jean-Baptiste Lespiau, Bogdan Damoc, Aidan Clark, et al., 2022. Improving language models by retrieving from trillions of tokens. In ICML. 2206-2240.","journal-title":"ICML."},{"key":"e_1_3_2_1_6_1","first-page":"13590","article-title":"The Revolution of Multimodal Large Language Models: A Survey","author":"Caffagni Davide","year":"2024","unstructured":"Davide Caffagni, Federico Cocchi, Luca Barsellotti, Nicholas Moratelli, Sara Sarto, Lorenzo Baraldi, Marcella Cornia, and Rita Cucchiara. 2024. The Revolution of Multimodal Large Language Models: A Survey. In ACL Findings. 13590-13618.","journal-title":"ACL Findings."},{"key":"e_1_3_2_1_7_1","first-page":"24185","article-title":"Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In CVPR. 24185-24198.","journal-title":"CVPR."},{"key":"e_1_3_2_1_8_1","first-page":"3569","article-title":"Caption-aware medical VQA via semantic focusing and progressive cross-modality comprehension","author":"Cong Fuze","year":"2022","unstructured":"Fuze Cong, Shibiao Xu, Li Guo, and Yinbing Tian. 2022. Caption-aware medical VQA via semantic focusing and progressive cross-modality comprehension. In MM. 3569-3577.","journal-title":"MM."},{"key":"e_1_3_2_1_9_1","volume-title":"Qlora: Efficient finetuning of quantized llms. Advances in neural information processing systems","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. 2023. Qlora: Efficient finetuning of quantized llms. Advances in neural information processing systems, Vol. 36 (2023), 10088-10115."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00381"},{"key":"e_1_3_2_1_11_1","unstructured":"Manuel Faysse Hugues Sibille Tony Wu Bilel Omrani Gautier Viaud C\u00e9line Hudelot and Pierre Colombo. 2025. ColPali: Efficient Document Retrieval with Vision Language Models. In ICLR."},{"key":"e_1_3_2_1_12_1","volume-title":"Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997","author":"Gao Yunfan","year":"2023","unstructured":"Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yixin Dai, Jiawei Sun, Haofen Wang, and Haofen Wang. 2023. Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997, Vol. 2, 1 (2023)."},{"key":"e_1_3_2_1_13_1","first-page":"6904","article-title":"Making the v in vqa matter: Elevating the role of image understanding in visual question answering","author":"Goyal Yash","year":"2017","unstructured":"Yash Goyal, Tejas Khot, Douglas Summers-Stay, Dhruv Batra, and Devi Parikh. 2017. Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In CVPR. 6904-6913.","journal-title":"CVPR."},{"key":"e_1_3_2_1_14_1","first-page":"10867","article-title":"From images to textual prompts: Zero-shot visual question answering with frozen large language models","author":"Guo Jiaxian","year":"2023","unstructured":"Jiaxian Guo, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Boyang Li, Dacheng Tao, and Steven Hoi. 2023. From images to textual prompts: Zero-shot visual question answering with frozen large language models. In CVPR. 10867-10877.","journal-title":"CVPR."},{"key":"e_1_3_2_1_15_1","first-page":"3929","article-title":"Retrieval augmented language model pre-training","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Mingwei Chang. 2020. Retrieval augmented language model pre-training. In ICML. 3929-3938.","journal-title":"ICML."},{"key":"e_1_3_2_1_16_1","unstructured":"Wenyi Hong Wenmeng Yu Xiaotao Gu Guo Wang Guobing Gan Haomiao Tang Jiale Cheng Ji Qi Junhui Ji Lihang Pan et al. 2025. GLM-4.1 V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning. arXiv preprint arXiv:2507.01006 (2025)."},{"key":"e_1_3_2_1_17_1","first-page":"6929","article-title":"mplug-paperowl: Scientific diagram analysis with the multimodal large language model","author":"Hu Anwen","year":"2024","unstructured":"Anwen Hu, Yaya Shi, Haiyang Xu, Jiabo Ye, Qinghao Ye, Ming Yan, Chenliang Li, Qi Qian, Ji Zhang, and Fei Huang. 2024a. mplug-paperowl: Scientific diagram analysis with the multimodal large language model. In ACM MM. 6929-6938.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_18_1","first-page":"3096","article-title":"mPLUG-DocOwl 1.5","author":"Hu Anwen","year":"2024","unstructured":"Anwen Hu, Haiyang Xu, Jiabo Ye, Ming Yan, Liang Zhang, Bo Zhang, Ji Zhang, Qin Jin, Fei Huang, and Jingren Zhou. 2024b. mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding. In EMNLP Findings. 3096-3120.","journal-title":"Unified Structure Learning for OCR-free Document Understanding. In EMNLP Findings."},{"key":"e_1_3_2_1_19_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. Lora: Low-rank adaptation of large language models. ICLR, Vol. 1, 2 (2022), 3.","journal-title":"ICLR"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110399"},{"key":"e_1_3_2_1_21_1","first-page":"1","article-title":"Atlas: Few-shot learning with retrieval augmented language models","volume":"24","author":"Izacard Gautier","year":"2023","unstructured":"Gautier Izacard, Patrick Lewis, Maria Lomeli, Lucas Hosseini, Fabio Petroni, Timo Schick, Jane Dwivedi-Yu, Armand Joulin, Sebastian Riedel, and Edouard Grave. 2023. Atlas: Few-shot learning with retrieval augmented language models. Journal of Machine Learning Research, Vol. 24, 251 (2023), 1-43.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_22_1","first-page":"6769","article-title":"Dense Passage Retrieval for Open-Domain Question Answering","author":"Karpukhin Vladimir","year":"2020","unstructured":"Vladimir Karpukhin, Barlas Oguz, Sewon Min, Patrick SH Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 2020. Dense Passage Retrieval for Open-Domain Question Answering.. In EMNLP. 6769-6781.","journal-title":"EMNLP."},{"key":"e_1_3_2_1_23_1","volume-title":"Generalization through memorization: Nearest neighbor language models. arXiv preprint arXiv:1911.00172","author":"Khandelwal Urvashi","year":"2019","unstructured":"Urvashi Khandelwal, Omer Levy, Dan Jurafsky, Luke Zettlemoyer, and Mike Lewis. 2019. Generalization through memorization: Nearest neighbor language models. arXiv preprint arXiv:1911.00172 (2019)."},{"key":"e_1_3_2_1_24_1","first-page":"39","article-title":"Colbert: Efficient and effective passage search via contextualized late interaction over bert","author":"Khattab Omar","year":"2020","unstructured":"Omar Khattab and Matei Zaharia. 2020. Colbert: Efficient and effective passage search via contextualized late interaction over bert. In SIGIR. 39-48.","journal-title":"SIGIR."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711680"},{"key":"e_1_3_2_1_26_1","first-page":"7871","article-title":"BART","author":"Lewis Mike","year":"2020","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Veselin Stoyanov, and Luke Zettlemoyer. 2020. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL. 7871-7880.","journal-title":"In ACL."},{"key":"e_1_3_2_1_27_1","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML. 19730-19742.","journal-title":"ICML."},{"key":"e_1_3_2_1_28_1","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. 12888-12900.","journal-title":"ICML."},{"key":"e_1_3_2_1_29_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_30_1","volume-title":"Noah Constant, Ji Ma, Keith B Hall, Daniel Cer, and Yinfei Yang.","author":"Ni Jianmo","year":"2021","unstructured":"Jianmo Ni, Gustavo Hernandez Abrego, Noah Constant, Ji Ma, Keith B Hall, Daniel Cer, and Yinfei Yang. 2021. Sentence-t5: Scalable sentence encoders from pre-trained text-to-text models. arXiv preprint arXiv:2108.08877 (2021)."},{"key":"e_1_3_2_1_31_1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In ICML. 8748-8763.","journal-title":"ICML."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00605"},{"key":"e_1_3_2_1_33_1","volume-title":"NeurIPS","volume":"28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In NeurIPS, Vol. 28."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1561\/1500000019"},{"key":"e_1_3_2_1_35_1","first-page":"10684","article-title":"High-resolution image synthesis with latent diffusion models","author":"Rombach Robin","year":"2022","unstructured":"Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR. 10684-10695.","journal-title":"CVPR."},{"key":"e_1_3_2_1_36_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_37_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al., 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_1_38_1","unstructured":"Kimi Team Angang Du Bohong Yin Bowei Xing Bowen Qu Bowen Wang Cheng Chen Chenlin Zhang Chenzhuang Du Chu Wei et al. 2025. Kimi-vl technical report. arXiv preprint arXiv:2504.07491 (2025)."},{"key":"e_1_3_2_1_39_1","first-page":"19528","article-title":"Document understanding dataset and evaluation (dude)","author":"Landeghem Jordy Van","year":"2023","unstructured":"Jordy Van Landeghem, Rub\u00e8n Tito, \u0141ukasz Borchmann, Micha\u0142 Pietruszka, Pawel Joziak, Rafal Powalski, Dawid Jurkiewicz, Micka\u00ebl Coustaty, Bertrand Anckaert, Ernest Valveny, et al., 2023. Document understanding dataset and evaluation (dude). In ICCV. 19528-19540.","journal-title":"ICCV."},{"key":"e_1_3_2_1_40_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_41_1","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume":"35","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. In NeurIPS, Vol. 35. 24824-24837.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3761997","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:07:18Z","timestamp":1765339638000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3761997"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":43,"alternative-id":["10.1145\/3746027.3761997","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3761997","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}