{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:16Z","timestamp":1765343056364,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":85,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758266","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"13133-13140","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Argus Inspection: Do Multimodal Large Language Models Possess the Eye of Panoptes?"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-1511-5149","authenticated-orcid":false,"given":"Yang","family":"Yao","sequence":"first","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China and The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3031-8223","authenticated-orcid":false,"given":"Lingyu","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0213-2224","authenticated-orcid":false,"given":"Jiaxin","family":"Song","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8836-7313","authenticated-orcid":false,"given":"Chiyu","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2265-7159","authenticated-orcid":false,"given":"Zhenqi","family":"He","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9519-5063","authenticated-orcid":false,"given":"Yixu","family":"Wang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9531-6662","authenticated-orcid":false,"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1546-8196","authenticated-orcid":false,"given":"Tianle","family":"Gu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3102-6425","authenticated-orcid":false,"given":"Jie","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7069-4728","authenticated-orcid":false,"given":"Yan","family":"Teng","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4115-1398","authenticated-orcid":false,"given":"Yingchun","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai.","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel De Jong, Yury Zemlyanskiy, Federico 
Lebr\u00f3n, and Sumit Sanghai. 2023. Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.12"},{"key":"e_1_3_2_1_4_1","unstructured":"Anthropic. 2025. Claude 3.7 sonnet. https:\/\/www.anthropic.com\/news\/claude-3-7-sonnet."},{"key":"e_1_3_2_1_5_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Mindbench: A comprehensive benchmark for mind map structure recognition and analysis. arXiv preprint arXiv:2407.02842","author":"Chen Lei","year":"2024","unstructured":"Lei Chen, Feng Yan, Yujie Zhong, Shaoxiang Chen, Zequn Jie, and Lin Ma. 2024b. Mindbench: A comprehensive benchmark for mind map structure recognition and analysis. arXiv preprint arXiv:2407.02842 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024a. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185-24198."},{"key":"e_1_3_2_1_8_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv:2305.06500 [cs.CV]"},{"key":"e_1_3_2_1_9_1","volume-title":"VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models. arXiv preprint arXiv:2407.11691","author":"Duan Haodong","year":"2024","unstructured":"Haodong Duan, Junming Yang, Yuxuan Qiao, Xinyu Fang, Lin Chen, Yuan Liu, Xiaoyi Dong, Yuhang Zang, Pan Zhang, Jiaqi Wang, Dahua Lin, and Kai Chen. 2024. VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models. arXiv preprint arXiv:2407.11691 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"On pre-training of multimodal language models customized for chart understanding. arXiv preprint arXiv:2407.14506","author":"Fan Wan-Cyuan","year":"2024","unstructured":"Wan-Cyuan Fan, Yen-Chun Chen, Mengchen Liu, Lu Yuan, and Leonid Sigal. 2024. On pre-training of multimodal language models customized for chart understanding. arXiv preprint arXiv:2407.14506 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Video-of-thought: Step-by-step video reasoning from perception to cognition. 
arXiv preprint arXiv:2501.03230","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong-Li Lee, and Wynne Hsu. 2024. Video-of-thought: Step-by-step video reasoning from perception to cognition. arXiv preprint arXiv:2501.03230 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Interleaved-modal chain-of-thought. arXiv preprint arXiv:2411.19488","author":"Gao Jun","year":"2024","unstructured":"Jun Gao, Yongqi Li, Ziqiang Cao, and Wenjie Li. 2024. Interleaved-modal chain-of-thought. arXiv preprint arXiv:2411.19488 (2024)."},{"key":"e_1_3_2_1_13_1","unstructured":"Google. 2025. Gemini 2.5 pro model card. https:\/\/storage.googleapis.com\/model-cards\/documents\/gemini-2.5-pro-preview.pdf."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0232"},{"volume-title":"MMWorld: Towards Multi-discipline Multi-faceted World Model Evaluation in Videos. In The Thirteenth International Conference on Learning Representations.","author":"He Xuehai","key":"e_1_3_2_1_15_1","unstructured":"Xuehai He, Weixi Feng, Kaizhi Zheng, Yujie Lu, Wanrong Zhu, Jiachen Li, Yue Fan, Jianfeng Wang, Linjie Li, Zhengyuan Yang, et al., [n.d.]. MMWorld: Towards Multi-discipline Multi-faceted World Model Evaluation in Videos. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_16_1","volume-title":"Cmmu: A benchmark for chinese multi-modal multi-type question understanding and reasoning. arXiv preprint arXiv:2401.14011","author":"He Zheqi","year":"2024","unstructured":"Zheqi He, Xinya Wu, Pengfei Zhou, Richeng Xuan, Guang Liu, Xi Yang, Qiannan Zhu, and Hua Huang. 2024. Cmmu: A benchmark for chinese multi-modal multi-type question understanding and reasoning. arXiv preprint arXiv:2401.14011 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-020-09517-1"},{"key":"e_1_3_2_1_18_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_19_1","unstructured":"Aaron Jaech Adam Kalai Adam Lerer Adam Richardson Ahmed El-Kishky Aiden Low Alec Helyar Aleksander Madry Alex Beutel Alex Carney et al. 2024. Openai o1 system card. arXiv preprint arXiv:2412.16720 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"International conference on machine learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_21_1","unstructured":"Yizhang Jin Jian Li Yexin Liu Tianjun Gu Kai Wu Zhengkai Jiang Muyang He Bo Zhao Xin Tan Zhenye Gan et al. 2024. Efficient multimodal large language models: A survey. arXiv preprint arXiv:2405.10739 (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2017.06.005"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00592"},{"key":"e_1_3_2_1_24_1","volume-title":"What's'' up'' with vision-language models? Investigating their struggle with spatial reasoning. arXiv preprint arXiv:2310.19785","author":"Kamath Amita","year":"2023","unstructured":"Amita Kamath, Jack Hessel, and Kai-Wei Chang. 2023. 
What's'' up'' with vision-language models? Investigating their struggle with spatial reasoning. arXiv preprint arXiv:2310.19785 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Letitia Parcalabescu, Iacer Calixto, Anette Frank, Albert Gatt, Aykut Erdem, et al.","author":"Kesen Ilker","year":"2023","unstructured":"Ilker Kesen, Andrea Pedrotti, Mustafa Dogan, Michele Cafagna, Emre Can Acikgoz, Letitia Parcalabescu, Iacer Calixto, Anette Frank, Albert Gatt, Aykut Erdem, et al., 2023. Vilma: A zero-shot benchmark for linguistic and temporal grounding in video-language models. arXiv preprint arXiv:2311.07022 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Tablevqa-bench: A visual question answering benchmark on multiple table domains. arXiv preprint arXiv:2404.19205","author":"Kim Yoonsik","year":"2024","unstructured":"Yoonsik Kim, Moonbin Yim, and Ka Yeon Song. 2024. Tablevqa-bench: A visual question answering benchmark on multiple table domains. arXiv preprint arXiv:2404.19205 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa.","author":"Kojima Takeshi","year":"2022","unstructured":"Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. 2022. Large language models are zero-shot reasoners. Advances in neural information processing systems, Vol. 35 (2022), 22199-22213."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David A Shamma et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision Vol. 123 (2017) 32-73.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_29_1","volume-title":"Imagine while Reasoning in Space: Multimodal Visualization-of-Thought. arXiv preprint arXiv:2501.07542","author":"Li Chengzu","year":"2025","unstructured":"Chengzu Li, Wenshan Wu, Huanyu Zhang, Yan Xia, Shaoguang Mao, Li Dong, Ivan Vuli\u0107, and Furu Wei. 2025b. Imagine while Reasoning in Space: Multimodal Visualization-of-Thought. arXiv preprint arXiv:2501.07542 (2025)."},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_31_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_32_1","volume-title":"A Survey on Multimodal Benchmarks: In the Era of Large AI Models. arXiv preprint arXiv:2409.18142","author":"Li Lin","year":"2024","unstructured":"Lin Li, Guikun Chen, Hanrong Shi, Jun Xiao, and Long Chen. 2024b. A Survey on Multimodal Benchmarks: In the Era of Large AI Models. arXiv preprint arXiv:2409.18142 (2024)."},{"key":"e_1_3_2_1_33_1","unstructured":"Ming Li Keyu Chen Ziqian Bi Ming Liu Benji Peng Qian Niu Junyu Liu Jinlang Wang Sen Zhang Xuanhe Pan et al. 2024a. Surveying the mllm landscape: A meta-review of current surveys. 
arXiv preprint arXiv:2409.18991 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Scigraphqa: A large-scale synthetic multi-turn question-answering dataset for scientific graphs. arXiv preprint arXiv:2308.03349","author":"Li Shengzhi","year":"2023","unstructured":"Shengzhi Li and Nima Tajbakhsh. 2023. Scigraphqa: A large-scale synthetic multi-turn question-answering dataset for scientific graphs. arXiv preprint arXiv:2308.03349 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"A Survey on Large Multimodal Reasoning Models. arXiv preprint arXiv:2505.04921","author":"Li Yunxin","year":"2025","unstructured":"Yunxin Li, Zhenyu Liu, Zitao Li, Xuanyu Zhang, Zhenran Xu, Xinyu Chen, Haoyuan Shi, Shenyuan Jiang, Xintong Wang, Jifang Wang, Shouzheng Huang, Xinping Zhao, Borui Jiang, Lanqing Hong, Longyue Wang, Zhuotao Tian, Baoxing Huai, Wenhan Luo, Weihua Luo, Zheng Zhang, Baotian Hu, and Min Zhang. 2025a. Perception, Reason, Think, and Plan: A Survey on Large Multimodal Reasoning Models. arXiv preprint arXiv:2505.04921 (2025)."},{"key":"e_1_3_2_1_36_1","volume-title":"Look before you decide: Prompting active deduction of mllms for assumptive reasoning. arXiv preprint arXiv:2404.12966","author":"Li Yian","year":"2024","unstructured":"Yian Li, Wentao Tian, Yang Jiao, Jingjing Chen, Tianwen Qian, Bin Zhu, Na Zhao, and Yu-Gang Jiang. 2024c. Look before you decide: Prompting active deduction of mllms for assumptive reasoning. arXiv preprint arXiv:2404.12966 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Sungyoung Ji, Byungju Lee, Xifeng Yan, et al.","author":"Li Zekun","year":"2024","unstructured":"Zekun Li, Xianjun Yang, Kyuri Choi, Wanrong Zhu, Ryan Hsieh, HyeonJung Kim, Jin Hyuk Lim, Sungyoung Ji, Byungju Lee, Xifeng Yan, et al., 2024d. Mmsci: A dataset for graduate-level multi-discipline multimodal scientific understanding. arXiv preprint arXiv:2407.04903 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Scemqa: A scientific college entrance level multimodal question answering benchmark. arXiv preprint arXiv:2402.05138","author":"Liang Zhenwen","year":"2024","unstructured":"Zhenwen Liang, Kehan Guo, Gang Liu, Taicheng Guo, Yujun Zhou, Tianyu Yang, Jiajun Jiao, Renjie Pi, Jipeng Zhang, and Xiangliang Zhang. 2024. Scemqa: A scientific college entrance level multimodal question answering benchmark. arXiv preprint arXiv:2402.05138 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Moe-llava: Mixture of experts for large vision-language models. arXiv preprint arXiv:2401.15947","author":"Lin Bin","year":"2024","unstructured":"Bin Lin, Zhenyu Tang, Yang Ye, Jiaxi Cui, Bin Zhu, Peng Jin, Jinfa Huang, Junwu Zhang, Yatian Pang, Munan Ning, et al., 2024. Moe-llava: Mixture of experts for large vision-language models. arXiv preprint arXiv:2401.15947 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Investigating inference-time scaling for chain of multi-modal thought: A preliminary study. arXiv preprint arXiv:2502.11514","author":"Lin Yujie","year":"2025","unstructured":"Yujie Lin, Ante Wang, Moye Chen, Jingyao Liu, Hao Liu, Jinsong Su, and Xinyan Xiao. 2025. Investigating inference-time scaling for chain of multi-modal thought: A preliminary study. arXiv preprint arXiv:2502.11514 (2025)."},{"key":"e_1_3_2_1_41_1","volume-title":"Mmc: Advancing multimodal chart understanding with large-scale instruction tuning. arXiv preprint arXiv:2311.10774","author":"Liu Fuxiao","year":"2023","unstructured":"Fuxiao Liu, Xiaoyang Wang, Wenlin Yao, Jianshu Chen, Kaiqiang Song, Sangwoo Cho, Yaser Yacoob, and Dong Yu. 2023c. 
Mmc: Advancing multimodal chart understanding with large-scale instruction tuning. arXiv preprint arXiv:2311.10774 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_43_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023a. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_44_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023b. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_45_1","volume-title":"Wai Lam, Graham Neubig, Yuanzhi Li, and Xiang Yue.","author":"Liu Junpeng","year":"2024","unstructured":"Junpeng Liu, Yifan Song, Bill Yuchen Lin, Wai Lam, Graham Neubig, Yuanzhi Li, and Xiang Yue. 2024b. Visualwebbench: How far have multimodal llms evolved in web page understanding and grounding? arXiv preprint arXiv:2404.05955 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Query-relevant images jailbreak large multi-modal models. arXiv preprint arXiv:2311.17600","author":"Liu Xin","year":"2023","unstructured":"Xin Liu, Yichen Zhu, Yunshi Lan, Chao Yang, and Yu Qiao. 2023d. Query-relevant images jailbreak large multi-modal models. arXiv preprint arXiv:2311.17600 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255","author":"Lu Pan","year":"2023","unstructured":"Pan Lu, Hritik Bansal, Tony Xia, Jiacheng Liu, Chunyuan Li, Hannaneh Hajishirzi, Hao Cheng, Kai-Wei Chang, Michel Galley, and Jianfeng Gao. 2023. Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255 (2023)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"e_1_3_2_1_50_1","volume-title":"Spiqa: A dataset for multimodal question answering on scientific papers. arXiv preprint arXiv:2407.09413","author":"Pramanick Shraman","year":"2024","unstructured":"Shraman Pramanick, Rama Chellappa, and Subhashini Venugopalan. 2024. Spiqa: A dataset for multimodal question answering on scientific papers. arXiv preprint arXiv:2407.09413 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00060"},{"key":"e_1_3_2_1_53_1","volume-title":"Visual chain of thought: bridging logical gaps with multimodal infillings. 
arXiv preprint arXiv:2305.02317","author":"Rose Daniel","year":"2023","unstructured":"Daniel Rose, Vaishnavi Himakunthala, Andy Ouyang, Ryan He, Alex Mei, Yujie Lu, Michael Saxon, Chinmay Sonar, Diba Mirza, and William Yang Wang. 2023. Visual chain of thought: bridging logical gaps with multimodal infillings. arXiv preprint arXiv:2305.02317 (2023)."},{"key":"e_1_3_2_1_54_1","unstructured":"Christoph Schuhmann Romain Beaumont Richard Vencu Cade Gordon Ross Wightman Mehdi Cherti Theo Coombes Aarush Katta Clayton Mullis Mitchell Wortsman et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in neural information processing systems Vol. 35 (2022) 25278-25294."},{"key":"e_1_3_2_1_55_1","volume-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)."},{"key":"e_1_3_2_1_56_1","first-page":"8612","article-title":"Visual cot: Advancing multi-modal language models with a comprehensive dataset and benchmark for chain-of-thought reasoning","volume":"37","author":"Shao Hao","year":"2024","unstructured":"Hao Shao, Shengju Qian, Han Xiao, Guanglu Song, Zhuofan Zong, Letian Wang, Yu Liu, and Hongsheng Li. 2024. Visual cot: Advancing multi-modal language models with a comprehensive dataset and benchmark for chain-of-thought reasoning. Advances in Neural Information Processing Systems, Vol. 37 (2024), 8612-8642.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_57_1","volume-title":"Tinylvlm-ehub: Towards comprehensive and efficient evaluation for large vision-language models","author":"Shao Wenqi","year":"2025","unstructured":"Wenqi Shao, Meng Lei, Yutao Hu, Peng Gao, Peng Xu, Kaipeng Zhang, Fanqing Meng, Siyuan Huang, Hongsheng Li, and Yu Qiao. 2025. Tinylvlm-ehub: Towards comprehensive and efficient evaluation for large vision-language models. IEEE Transactions on Big Data (2025)."},{"key":"e_1_3_2_1_58_1","volume-title":"Math-llava: Bootstrapping mathematical reasoning for multimodal large language models. arXiv preprint arXiv:2406.17294","author":"Shi Wenhao","year":"2024","unstructured":"Wenhao Shi, Zhiqiang Hu, Yi Bin, Junhua Liu, Yang Yang, See-Kiong Ng, Lidong Bing, and Roy Ka-Wei Lee. 2024. Math-llava: Bootstrapping mathematical reasoning for multimodal large language models. arXiv preprint arXiv:2406.17294 (2024)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Amanpreet Singh Vivek Natarajan Meet Shah Yu Jiang Xinlei Chen Dhruv Batra Devi Parikh and Marcus Rohrbach. 2019. Towards vqa models that can read. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 8317-8326.","DOI":"10.1109\/CVPR.2019.00851"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/2812802"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01789"},{"key":"e_1_3_2_1_62_1","first-page":"95095","article-title":"Measuring multimodal mathematical reasoning with math-vision dataset","volume":"37","author":"Wang Ke","year":"2024","unstructured":"Ke Wang, Junting Pan, Weikang Shi, Zimu Lu, Houxing Ren, Aojun Zhou, Mingjie Zhan, and Hongsheng Li. 
2024b. Measuring multimodal mathematical reasoning with math-vision dataset. Advances in Neural Information Processing Systems, Vol. 37 (2024), 95095-95169.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_63_1","first-page":"121475","article-title":"Cogvlm: Visual expert for pretrained language models","volume":"37","author":"Wang Weihan","year":"2024","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Song XiXuan, et al., 2024a. Cogvlm: Visual expert for pretrained language models. Advances in Neural Information Processing Systems, Vol. 37 (2024), 121475-121499.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_64_1","first-page":"20540","article-title":"Needle in a multimodal haystack","volume":"37","author":"Wang Weiyun","year":"2024","unstructured":"Weiyun Wang, Shuibo Zhang, Yiming Ren, Yuchen Duan, Tiantong Li, Shuo Liu, Mengkang Hu, Zhe Chen, Kaipeng Zhang, Lewei Lu, et al., 2024d. Needle in a multimodal haystack. Advances in Neural Information Processing Systems, Vol. 37 (2024), 20540-20565.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_65_1","first-page":"113569","article-title":"Charxiv: Charting gaps in realistic chart understanding in multimodal llms","volume":"37","author":"Wang Zirui","year":"2024","unstructured":"Zirui Wang, Mengzhou Xia, Luxi He, Howard Chen, Yitao Liu, Richard Zhu, Kaiqu Liang, Xindi Wu, Haotian Liu, Sadhika Malladi, et al., 2024c. Charxiv: Charting gaps in realistic chart understanding in multimodal llms. Advances in Neural Information Processing Systems, Vol. 37 (2024), 113569-113697.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_66_1","volume-title":"Logicvista: Multimodal llm logical reasoning benchmark in visual contexts. arXiv preprint arXiv:2407.04973","author":"Xiao Yijia","year":"2024","unstructured":"Yijia Xiao, Edward Sun, Tianyu Liu, and Wei Wang. 2024. Logicvista: Multimodal llm logical reasoning benchmark in visual contexts. arXiv preprint arXiv:2407.04973 (2024)."},{"key":"e_1_3_2_1_67_1","volume-title":"International conference on machine learning. PMLR, 2397-2406","author":"Xiong Caiming","year":"2016","unstructured":"Caiming Xiong, Stephen Merity, and Richard Socher. 2016. Dynamic memory networks for visual and textual question answering. In International conference on machine learning. PMLR, 2397-2406."},{"key":"e_1_3_2_1_68_1","volume-title":"Chartbench: A benchmark for complex visual reasoning in charts. arXiv preprint arXiv:2312.15915","author":"Xu Zhengzhuo","year":"2023","unstructured":"Zhengzhuo Xu, Sinan Du, Yiyan Qi, Chengjin Xu, Chun Yuan, and Jian Guo. 2023. Chartbench: A benchmark for complex visual reasoning in charts. arXiv preprint arXiv:2312.15915 (2023)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"e_1_3_2_1_70_1","volume-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, Jianfeng Wang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023. The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421, Vol. 9, 1 (2023), 1."},{"key":"e_1_3_2_1_71_1","volume-title":"Filip: Fine-grained interactive language-image pre-training. 
arXiv preprint arXiv:2111.07783","author":"Yao Lewei","year":"2021","unstructured":"Lewei Yao, Runhui Huang, Lu Hou, Guansong Lu, Minzhe Niu, Hang Xu, Xiaodan Liang, Zhenguo Li, Xin Jiang, and Chunjing Xu. 2021. Filip: Fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783 (2021)."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403"},{"key":"e_1_3_2_1_73_1","volume-title":"Mmt-bench: A comprehensive multimodal benchmark for evaluating large vision-language models towards multitask agi. arXiv preprint arXiv:2404.16006","author":"Ying Kaining","year":"2024","unstructured":"Kaining Ying, Fanqing Meng, Jin Wang, Zhiqian Li, Han Lin, Yue Yang, Hao Zhang, Wenbo Zhang, Yuqi Lin, Shuo Liu, et al., 2024. Mmt-bench: A comprehensive multimodal benchmark for evaluating large vision-language models towards multitask agi. arXiv preprint arXiv:2404.16006 (2024)."},{"key":"e_1_3_2_1_74_1","volume-title":"Cmmmu: A chinese massive multi-discipline multimodal understanding benchmark. arXiv preprint arXiv:2401.11944","author":"Zhang Ge","year":"2024","unstructured":"Ge Zhang, Xinrun Du, Bei Chen, Yiming Liang, Tongxu Luo, Tianyu Zheng, Kang Zhu, Yuyang Cheng, Chunpu Xu, Shuyue Guo, et al., 2024a. Cmmmu: A chinese massive multi-discipline multimodal understanding benchmark. arXiv preprint arXiv:2401.11944 (2024)."},{"key":"e_1_3_2_1_75_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023a. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_1_76_1","volume-title":"European Conference on Computer Vision. Springer, 169-186","author":"Zhang Renrui","year":"2024","unstructured":"Renrui Zhang, Dongzhi Jiang, Yichi Zhang, Haokun Lin, Ziyu Guo, Pengshuo Qiu, Aojun Zhou, Pan Lu, Kai-Wei Chang, Yu Qiao, et al., 2024b. Mathverse: Does your multi-modal llm truly see the diagrams in visual math problems?. In European Conference on Computer Vision. Springer, 169-186."},{"key":"e_1_3_2_1_77_1","volume-title":"Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923","author":"Zhang Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, Hai Zhao, George Karypis, and Alex Smola. 2023b. Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923 (2023)."},{"key":"e_1_3_2_1_78_1","volume-title":"Marco-o1: Towards open reasoning models for open-ended solutions. arXiv preprint arXiv:2411.14405","author":"Zhao Yu","year":"2024","unstructured":"Yu Zhao, Huifeng Yin, Bo Zeng, Hao Wang, Tianqi Shi, Chenyang Lyu, Longyue Wang, Weihua Luo, and Kaifu Zhang. 2024. Marco-o1: Towards open reasoning models for open-ended solutions. arXiv preprint arXiv:2411.14405 (2024)."},{"key":"e_1_3_2_1_79_1","volume-title":"Multimodal table understanding. arXiv preprint arXiv:2406.08100","author":"Zheng Mingyu","year":"2024","unstructured":"Mingyu Zheng, Xinwei Feng, Qingyi Si, Qiaoqiao She, Zheng Lin, Wenbin Jiang, and Weiping Wang. 2024. Multimodal table understanding. arXiv preprint arXiv:2406.08100 (2024)."},{"key":"e_1_3_2_1_80_1","volume-title":"Reinforced MLLM: A Survey on RL-Based Reasoning in Multimodal Large Language Models. 
arXiv preprint arXiv:2504.21277","author":"Zhou Guanghao","year":"2025","unstructured":"Guanghao Zhou, Panjia Qiu, Cen Chen, Jie Wang, Zheming Yang, Jian Xu, and Minghui Qiu. 2025a. Reinforced MLLM: A Survey on RL-Based Reasoning in Multimodal Large Language Models. arXiv preprint arXiv:2504.21277 (2025)."},{"key":"e_1_3_2_1_81_1","unstructured":"Pengfei Zhou Fanrui Zhang Xiaopeng Peng Zhaopan Xu Jiaxin Ai Yansheng Qiu Chuanhao Li Zhen Li Ming Li Yukang Feng et al. 2025b. MDK12-Bench: A Multi-Discipline Benchmark for Evaluating Reasoning in Multimodal Large Language Models. arXiv preprint arXiv:2504.05782 (2025)."},{"key":"e_1_3_2_1_82_1","volume-title":"Image-of-thought prompting for visual reasoning refinement in multimodal large language models. arXiv preprint arXiv:2405.13872","author":"Zhou Qiji","year":"2024","unstructured":"Qiji Zhou, Ruochen Zhou, Zike Hu, Panzhong Lu, Siyang Gao, and Yue Zhang. 2024b. Image-of-thought prompting for visual reasoning refinement in multimodal large language models. arXiv preprint arXiv:2405.13872 (2024)."},{"key":"e_1_3_2_1_83_1","volume-title":"Is your model really a good math reasoner? evaluating mathematical reasoning with checklist. arXiv preprint arXiv:2407.08733","author":"Zhou Zihao","year":"2024","unstructured":"Zihao Zhou, Shudong Liu, Maizhen Ning, Wei Liu, Jindong Wang, Derek F Wong, Xiaowei Huang, Qiufeng Wang, and Kaizhu Huang. 2024a. Is your model really a good math reasoner? evaluating mathematical reasoning with checklist. arXiv preprint arXiv:2407.08733 (2024)."},{"key":"e_1_3_2_1_84_1","volume-title":"European Conference on Computer Vision. Springer, 151-168","author":"Zhu Chenming","year":"2024","unstructured":"Chenming Zhu, Tai Wang, Wenwei Zhang, Kai Chen, and Xihui Liu. 2024a. Scanreason: Empowering 3d visual grounding with reasoning capabilities. In European Conference on Computer Vision. Springer, 151-168."},{"key":"e_1_3_2_1_85_1","volume-title":"Multi: Multimodal understanding leaderboard with text and images. arXiv preprint arXiv:2402.03173","author":"Zhu Zichen","year":"2024","unstructured":"Zichen Zhu, Yang Xu, Lu Chen, Jingkai Yang, Yichuan Ma, Yiming Sun, Hailin Wen, Jiaqi Liu, Jinyu Cai, Yingzi Ma, et al., 2024b. Multi: Multimodal understanding leaderboard with text and images. arXiv preprint arXiv:2402.03173 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758266","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:51Z","timestamp":1765342791000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758266"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":85,"alternative-id":["10.1145\/3746027.3758266","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758266","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}