{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:14:01Z","timestamp":1765343641581,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758288","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"13297-13303","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Chart-HQA: A Benchmark for Hypothetical Question Answering in Charts"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-7255-8973","authenticated-orcid":false,"given":"Xiangnan","family":"Chen","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8783-8484","authenticated-orcid":false,"given":"Yuancheng","family":"Fang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2258-1291","authenticated-orcid":false,"given":"Juncheng","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4829-7516","authenticated-orcid":false,"given":"Qian","family":"Xiao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8942-3627","authenticated-orcid":false,"given":"Jun","family":"Lin","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7356-9711","authenticated-orcid":false,"given":"Siliang","family":"Tang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9017-2508","authenticated-orcid":false,"given":"Yueting","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, et al., 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"OneChart: Purify the Chart Structural Extraction via One Auxiliary Token. arXiv preprint arXiv:2404.09987","author":"Chen Jinyue","year":"2024","unstructured":"Jinyue Chen, Lingyu Kong, Haoran Wei, Chenglong Liu, Zheng Ge, Liang Zhao, Jianjian Sun, Chunrui Han, and Xiangyu Zhang. 2024. OneChart: Purify the Chart Structural Extraction via One Auxiliary Token. arXiv preprint arXiv:2404.09987 (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu Lixin Gu Xuehui Wang Qingyun Li Yimin Ren Zixuan Chen Jiapeng Luo Jiahao Wang Tan Jiang Bo Wang Conghui He Botian Shi Xingcheng Zhang Han Lv Yi Wang Wenqi Shao Pei Chu Zhongying Tu Tong He Zhiyong Wu Huipeng Deng Jiaye Ge Kai Chen Kaipeng Zhang Limin Wang Min Dou Lewei Lu Xizhou Zhu Tong Lu Dahua Lin Yu Qiao Jifeng Dai and Wenhai Wang. 2025. Expanding Performance Boundaries of Open-Source Multimodal Models with Model Data and Test-Time Scaling."},{"key":"e_1_3_2_1_4_1","volume-title":"HALLUSIONBENCH: An Advanced Diagnostic Suite for Entangled Language Hallucination & Visual Illusion in Large Vision-Language Models. arXiv preprint arXiv:2310.14566","author":"Guan Tianrui","year":"2023","unstructured":"Tianrui Guan, Fuxiao Liu, Xiyang Wu, et al., 2023. HALLUSIONBENCH: An Advanced Diagnostic Suite for Entangled Language Hallucination & Visual Illusion in Large Vision-Language Models. arXiv preprint arXiv:2310.14566 (2023)."},{"key":"e_1_3_2_1_5_1","unstructured":"Yucheng Han Chi Zhang Xin Chen et al. 2023a. ChartLlama: A Multimodal LLM for Chart Understanding and Generation. arXiv preprint arXiv:2311.16483 (2023)."},{"key":"e_1_3_2_1_6_1","unstructured":"Yucheng Han Chi Zhang Xin Chen Xu Yang Zhibin Wang Gang Yu Bin Fu and Hanwang Zhang. 2023b. ChartLlama: A Multimodal LLM for Chart Understanding and Generation. arXiv:2311.16483"},{"key":"e_1_3_2_1_7_1","volume-title":"SciCap: Generating captions for scientific figures. arXiv preprint arXiv:2110.11624","author":"Hsu Ting-Yao","year":"2021","unstructured":"Ting-Yao Hsu, C Lee Giles, and Ting-Hao'Kenneth' Huang. 2021. SciCap: Generating captions for scientific figures. arXiv preprint arXiv:2110.11624 (2021)."},{"key":"e_1_3_2_1_8_1","unstructured":"Anwen Hu Haiyang Xu Liang Zhang Jiabo Ye Ming Yan Ji Zhang Qin Jin Fei Huang and Jingren Zhou. 2024. mPLUG-DocOwl2: High-resolution Compressing for OCR-free Multi-page Document Understanding. arXiv:2409.03420"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.41"},{"key":"e_1_3_2_1_10_1","volume-title":"Xiang Lin, et al.","author":"Kantharaj Shankar","year":"2022","unstructured":"Shankar Kantharaj, Rixie Tiffany Ko Leong, Xiang Lin, et al., 2022. Chart-to-text: A large-scale benchmark for chart summarization. arXiv preprint arXiv:2203.06486 (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Machine Learning. 18893-18912","author":"Lee Kenton","year":"2023","unstructured":"Kenton Lee, Mandar Joshi, Iulia Raluca Turc, Hexiang Hu, Fangyu Liu, Julian Martin Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, and Kristina Toutanova. 2023. Pix2struct: Screenshot parsing as pretraining for visual language understanding. In International Conference on Machine Learning. 18893-18912."},{"key":"e_1_3_2_1_12_1","first-page":"0","volume-title":"ICML","volume":"202","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, et al., 2023a. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In ICML, Vol. 202. 19730-19742."},{"key":"e_1_3_2_1_13_1","volume-title":"Scigraphqa: A large-scale synthetic multi-turn question-answering dataset for scientific graphs. arXiv preprint arXiv:2308.03349","author":"Li Shengzhi","year":"2023","unstructured":"Shengzhi Li and Nima Tajbakhsh. 2023. Scigraphqa: A large-scale synthetic multi-turn question-answering dataset for scientific graphs. arXiv preprint arXiv:2308.03349 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Monkey: Image resolution and text label are important things for large multi-modal models. arXiv preprint arXiv:2311.06607","author":"Li Zhang","year":"2023","unstructured":"Zhang Li, Biao Yang, Qiang Liu, Zhiyin Ma, Shuo Zhang, Jingxu Yang, Yabo Sun, Yuliang Liu, and Xiang Bai. 2023b. Monkey: Image resolution and text label are important things for large multi-modal models. arXiv preprint arXiv:2311.06607 (2023)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.660"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.714"},{"key":"e_1_3_2_1_17_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li et al. 2023c. Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_18_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong Jae Lee. 2023b. Improved Baselines with Visual Instruction Tuning. arXiv:2310.03744"},{"key":"e_1_3_2_1_19_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023d. Visual Instruction Tuning. In NeurIPS."},{"key":"e_1_3_2_1_20_1","unstructured":"Haoyu Lu Wen Liu Bo Zhang Bingxuan Wang Kai Dong Bo Liu Jingxiang Sun Tongzheng Ren Zhuoshu Li Hao Yang Yaofeng Sun Chengqi Deng Hanwei Xu Zhenda Xie and Chong Ruan. 2024. DeepSeek-VL: Towards Real-World Vision-Language Understanding. arXiv:2403.05525"},{"key":"e_1_3_2_1_21_1","volume-title":"Jia Qing Tan, Shafiq Joty, and Enamul Hoque.","author":"Masry Ahmed","year":"2022","unstructured":"Ahmed Masry, Xuan Long Do, Jia Qing Tan, Shafiq Joty, and Enamul Hoque. 2022. ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In Findings of the Association for Computational Linguistics: ACL 2022. 2263-2279."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.906"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093523"},{"key":"e_1_3_2_1_24_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Abdullah Al Farhad, et al","author":"Rahman Raian","year":"2023","unstructured":"Raian Rahman, Rizvi Hasan, Abdullah Al Farhad, et al., 2023. ChartSumm: A Comprehensive Benchmark for Automatic Chart Summarization of Long and Short Summaries. arXiv preprint arXiv:2304.13620 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems.","author":"Shinn Noah","year":"2023","unstructured":"Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik R Narasimhan, and Shunyu Yao. 2023. Reflexion: Language agents with verbal reinforcement learning. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_27_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_28_1","unstructured":"Qwen Team. 2025. Qwen2.5-VL. https:\/\/qwenlm.github.io\/blog\/qwen2.5-vl\/"},{"key":"e_1_3_2_1_29_1","unstructured":"Qwen-VL Team. 2024. Qwen-VL-Max."},{"key":"e_1_3_2_1_30_1","unstructured":"Weihan Wang Qingsong Lv Wenmeng Yu et al. 2023. CogVLM: Visual Expert for Pretrained Language Models. arXiv preprint arXiv:2311.03079 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Self-instruct: Aligning language model with self generated instructions. arXiv preprint arXiv:2212.10560","author":"Wang Yizhong","year":"2022","unstructured":"Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, et al., 2022. Self-instruct: Aligning language model with self generated instructions. arXiv preprint arXiv:2212.10560 (2022)."},{"key":"e_1_3_2_1_32_1","unstructured":"Renqiu Xia Bo Zhang Hancheng Ye Xiangchao Yan Qi Liu Hongbin Zhou Zijun Chen Min Dou Botian Shi Junchi Yan et al. 2024b. ChartX & ChartVLM: A Versatile Benchmark and Foundation Model for Complicated Chart Reasoning. arXiv preprint arXiv:2402.12185 (2024)."},{"key":"e_1_3_2_1_33_1","unstructured":"Renqiu Xia Bo Zhang Hancheng Ye Xiangchao Yan Qi Liu Hongbin Zhou Zijun Chen Min Dou Botian Shi Junchi Yan and Yu Qiao. 2024a. ChartX & ChartVLM: A Versatile Benchmark and Foundation Model for Complicated Chart Reasoning. arXiv:2402.12185"},{"key":"e_1_3_2_1_34_1","unstructured":"Zhengzhuo Xu Sinan Du Yiyan Qi Chengjin Xu Chun Yuan and Jian Guo. 2024. ChartBench: A Benchmark for Complex Visual Reasoning in Charts. arXiv:2312.15915"},{"key":"e_1_3_2_1_35_1","volume-title":"TinyChart: Efficient Chart Understanding with Visual Token Merging and Program-of-Thoughts Learning. arXiv preprint arXiv:2404.16635","author":"Zhang Liang","year":"2024","unstructured":"Liang Zhang, Anwen Hu, Haiyang Xu, Ming Yan, Yichen Xu, Qin Jin, Ji Zhang, and Fei Huang. 2024. TinyChart: Efficient Chart Understanding with Visual Token Merging and Program-of-Thoughts Learning. arXiv preprint arXiv:2404.16635 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Roy Ka-wei Lee, et al","author":"Zhu Jiawen","year":"2021","unstructured":"Jiawen Zhu, Jinye Ran, Roy Ka-wei Lee, et al., 2021. AutoChart: A dataset for chart-to-text generation task. arXiv preprint arXiv:2108.06897 (2021)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758288","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:09:07Z","timestamp":1765343347000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758288"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":36,"alternative-id":["10.1145\/3746027.3758288","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758288","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}