{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:05:12Z","timestamp":1765343112910,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","funder":[{"name":"National Key R&D Program of China","award":["2024YFA1014003"],"award-info":[{"award-number":["2024YFA1014003"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92470121"],"award-info":[{"award-number":["92470121"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758240","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"12942-12948","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MathScape: Benchmarking Multimodal Large Language Models in Real-World Mathematical Contexts"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2963-2210","authenticated-orcid":false,"given":"Hao","family":"Liang","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0363-3607","authenticated-orcid":false,"given":"Linzhuang","family":"Sun","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5416-3157","authenticated-orcid":false,"given":"zhouminxuan","family":"zhouminxuan","sequence":"additional","affiliation":[{"name":"Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6907-8289","authenticated-orcid":false,"given":"Zirong","family":"Chen","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3009-2886","authenticated-orcid":false,"given":"Meiyi","family":"Qiang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, Bei Jing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2104-4641","authenticated-orcid":false,"given":"Mingan","family":"Lin","sequence":"additional","affiliation":[{"name":"Baichuan Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1651-955X","authenticated-orcid":false,"given":"Tianpeng","family":"Li","sequence":"additional","affiliation":[{"name":"Baichuan Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0817-6508","authenticated-orcid":false,"given":"Fan","family":"Yang","sequence":"additional","affiliation":[{"name":"Baichuan Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5562-9604","authenticated-orcid":false,"given":"Zenan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Baichuan Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7532-5550","authenticated-orcid":false,"given":"Wentao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Mc-llava: Multi-concept personalized vision-language model. arXiv preprint arXiv:2411.11706","author":"An Ruichuan","year":"2024","unstructured":"Ruichuan An, Sihan Yang, Ming Lu, Renrui Zhang, Kai Zeng, Yulin Luo, Jiajun Cao, Hao Liang, Ying Chen, Qi She, et al., 2024. Mc-llava: Multi-concept personalized vision-language model. arXiv preprint arXiv:2411.11706 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Ruichuan An Sihan Yang Renrui Zhang Zijun Shen Ming Lu Gaole Dai Hao Liang Ziyu Guo Shilin Yan Yulin Luo et al. 2025. UniCTokens: Boosting Personalized Understanding and Generation via Unified Concept Tokens. arXiv preprint arXiv:2505.14671 (2025)."},{"key":"e_1_3_2_1_3_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023a. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond.","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023b. Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond. (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"A Survey of Multimodal Large Language Model from A Data-centric Perspective. arXiv preprint arXiv:2405.16640","author":"Bai Tianyi","year":"2024","unstructured":"Tianyi Bai, Hao Liang, Binwang Wan, Ling Yang, Bozhou Li, Yifan Wang, Bin Cui, Conghui He, Binhang Yuan, and Wentao Zhang. 2024. A Survey of Multimodal Large Language Model from A Data-centric Perspective. arXiv preprint arXiv:2405.16640 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198."},{"key":"e_1_3_2_1_7_1","volume-title":"G-llava: Solving geometric problem with multi-modal large language model. arXiv preprint arXiv:2312.11370","author":"Gao Jiahui","year":"2023","unstructured":"Jiahui Gao, Renjie Pi, Jipeng Zhang, Jiacheng Ye, Wanjun Zhong, Yufei Wang, Lanqing Hong, Jianhua Han, Hang Xu, Zhenguo Li, et al., 2023. G-llava: Solving geometric problem with multi-modal large language model. arXiv preprint arXiv:2312.11370 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793","author":"Aohan Zeng Team GLM","year":"2024","unstructured":"Team GLM, Aohan Zeng, Bin Xu, Bowen Wang, Chenhui Zhang, Da Yin, Dan Zhang, Diego Rojas, Guanyu Feng, Hanlin Zhao, et al., 2024. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"CMMU: A Benchmark for Chinese Multi-modal Multi-type Question Understanding and Reasoning. arXiv preprint arXiv:2401.14011","author":"He Zheqi","year":"2024","unstructured":"Zheqi He, Xinya Wu, Pengfei Zhou, Richeng Xuan, Guang Liu, Xi Yang, Qiannan Zhu, and Hua Huang. 2024. CMMU: A Benchmark for Chinese Multi-modal Multi-type Question Understanding and Reasoning. arXiv preprint arXiv:2401.14011 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, et al., 2024. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023a. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_12_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_13_1","volume-title":"Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023a. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Visual Instruction Tuning. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023b. Visual Instruction Tuning. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023."},{"key":"e_1_3_2_1_15_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_16_1","unstructured":"Haoyu Lu Wen Liu Bo Zhang Bingxuan Wang Kai Dong Bo Liu Jingxiang Sun Tongzheng Ren Zhuoshu Li Yaofeng Sun et al. 2024. Deepseek-vl: towards real-world vision-language understanding. arXiv preprint arXiv:2403.05525 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255","author":"Lu Pan","year":"2023","unstructured":"Pan Lu, Hritik Bansal, Tony Xia, Jiacheng Liu, Chunyuan Li, Hannaneh Hajishirzi, Hao Cheng, Kai-Wei Chang, Michel Galley, and Jianfeng Gao. 2023. Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"European Conference on Computer Vision. Springer, 235-252","author":"Luo Yulin","year":"2024","unstructured":"Yulin Luo, Ruichuan An, Bocheng Zou, Yiming Tang, Jiaming Liu, and Shanghang Zhang. 2024. Llm as dataset analyst: Subpopulation structure discovery with large language model. In European Conference on Computer Vision. Springer, 235-252."},{"key":"e_1_3_2_1_19_1","unstructured":"OpenAI. 2023a. ChatGPT. https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_1_20_1","volume-title":"View in Article","volume":"2","author":"R","year":"2023","unstructured":"R OpenAI. 2023b. Gpt-4 technical report. arxiv 2303.08774. View in Article, Vol. 2, 5 (2023)."},{"key":"e_1_3_2_1_21_1","unstructured":"Machel Reid Nikolay Savinov Denis Teplyashin Dmitry Lepikhin Timothy Lillicrap Jean-baptiste Alayrac Radu Soricut Angeliki Lazaridou Orhan Firat Julian Schrittwieser et al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Math-llava: Bootstrapping mathematical reasoning for multimodal large language models. arXiv preprint arXiv:2406.17294","author":"Shi Wenhao","year":"2024","unstructured":"Wenhao Shi, Zhiqiang Hu, Yi Bin, Junhua Liu, Yang Yang, See-Kiong Ng, Lidong Bing, and Roy Ka-Wei Lee. 2024. Math-llava: Bootstrapping mathematical reasoning for multimodal large language models. arXiv preprint arXiv:2406.17294 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al., 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Measuring multimodal mathematical reasoning with math-vision dataset. arXiv preprint arXiv:2402.14804","author":"Wang Ke","year":"2024","unstructured":"Ke Wang, Junting Pan, Weikang Shi, Zimu Lu, Mingjie Zhan, and Hongsheng Li. 2024. Measuring multimodal mathematical reasoning with math-vision dataset. arXiv preprint arXiv:2402.14804 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Multimodal large language models: A survey. arXiv preprint arXiv:2311.13165","author":"Wu Jiayang","year":"2023","unstructured":"Jiayang Wu, Wensheng Gan, Zefeng Chen, Shicheng Wan, and Philip S Yu. 2023. Multimodal large language models: A survey. arXiv preprint arXiv:2311.13165 (2023)."},{"key":"e_1_3_2_1_26_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Ce Bian Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan et al. 2023. Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652","author":"Young Alex","year":"2024","unstructured":"Alex Young, Bei Chen, Chao Li, Chengen Huang, Ge Zhang, Guanwei Zhang, Heng Li, Jiangcheng Zhu, Jianqun Chen, Jing Chang, et al., 2024. Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Mathverse: Does your multi-modal llm truly see the diagrams in visual math problems? arXiv preprint arXiv:2403.14624","author":"Zhang Renrui","year":"2024","unstructured":"Renrui Zhang, Dongzhi Jiang, Yichi Zhang, Haokun Lin, Ziyu Guo, Pengshuo Qiu, Aojun Zhou, Pan Lu, Kai-Wei Chang, Peng Gao, et al., 2024. Mathverse: Does your multi-modal llm truly see the diagrams in visual math problems? arXiv preprint arXiv:2403.14624 (2024)."},{"key":"e_1_3_2_1_29_1","unstructured":"Wayne Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong et al. 2023. A survey of large language models. arXiv preprint arXiv:2303.18223 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758240","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:01:16Z","timestamp":1765342876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758240"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":29,"alternative-id":["10.1145\/3746027.3758240","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758240","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}