{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:01:09Z","timestamp":1775325669916,"version":"3.50.1"},"publisher-location":"Cham","reference-count":48,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732416","type":"print"},{"value":"9783031732423","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73242-3_10","type":"book-chapter","created":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:15:43Z","timestamp":1730106943000},"page":"169-186","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":38,"title":["MATHVERSE: Does Your Multi-modal LLM Truly See the\u00a0Diagrams in\u00a0Visual Math Problems?"],"prefix":"10.1007","author":[{"given":"Renrui","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Dongzhi","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Yichi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Haokun","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Ziyu","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Pengshuo","family":"Qiu","sequence":"additional","affiliation":[]},{"given":"Aojun","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Pan","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Kai-Wei","family":"Chang","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,29]]},"reference":[{"key":"10_CR1","unstructured":"Bai, J., et al.: Qwen-VL: a versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"10_CR2","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: Advances in neural information processing systems, pp. 1877\u20131901 (2020)"},{"key":"10_CR3","unstructured":"Cai, Z., et\u00a0al.: InternLM2 technical report. arXiv preprint arXiv:2403.17297 (2024)"},{"key":"10_CR4","unstructured":"Cao, J., Xiao, J.: An augmented benchmark dataset for geometric question answering through dual parallel text encoding. In: Proceedings of the 29th International Conference on Computational Linguistics, pp. 1511\u20131520 (2022)"},{"key":"10_CR5","doi-asserted-by":"crossref","unstructured":"Chen, J., et al.: UniGeo: unifying geometry logical reasoning via reformulating mathematical expression. 
arXiv preprint arXiv:2212.02746 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.218"},{"key":"10_CR6","doi-asserted-by":"crossref","unstructured":"Chen, J., et al.: GeoQA: a geometric question answering benchmark towards multimodal numerical reasoning. arXiv preprint arXiv:2105.14517 (2021)","DOI":"10.18653\/v1\/2021.findings-acl.46"},{"key":"10_CR7","unstructured":"Chen, J., Li, D.Z.X.S.X., Zhang, Z.L.P., Xiong, R.K.V.C.Y., Elhoseiny, M.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"10_CR8","unstructured":"Chen, L., et al.: ShareGPT4v: improving large multi-modal models with better captions. ArXiv abs\/2311.12793 (2023). https:\/\/api.semanticscholar.org\/CorpusID:265308687"},{"key":"10_CR9","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* chatGPT quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"10_CR10","unstructured":"Dong, X., et\u00a0al.: InternLM-Xcomposer2: mastering free-form text-image composition and comprehension in vision-language large model. arXiv preprint arXiv:2401.16420 (2024)"},{"key":"10_CR11","unstructured":"Fu, C., et al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"10_CR12","unstructured":"Fu, C., et\u00a0al.: A challenger to GPT-4V? Early explorations of Gemini in visual expertise. arXiv preprint arXiv:2312.12436 (2023)"},{"key":"10_CR13","unstructured":"Gao, J., et\u00a0al.: G-LLaVA: solving geometric problem with multi-modal large language model. arXiv preprint arXiv:2312.11370 (2023)"},{"key":"10_CR14","unstructured":"Gao, P., et al.: LLaMA-adapter v2: parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)"},{"key":"10_CR15","unstructured":"Gao, P., et\u00a0al.: SPHINX-X: scaling data and parameters for a family of multi-modal large language models. arXiv preprint arXiv:2402.05935 (2024)"},{"key":"10_CR16","unstructured":"Gemini\u00a0Team, G.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"10_CR17","unstructured":"Guo, Z., et al.: Sciverse (2024). https:\/\/sciverse-cuhk.github.io"},{"key":"10_CR18","unstructured":"Guo, Z., et\u00a0al.: Point-bind & point-LLM: aligning point cloud with multi-modality for 3D understanding, generation, and instruction following. arXiv preprint arXiv:2309.00615 (2023)"},{"key":"10_CR19","unstructured":"Han, J., et\u00a0al.: ImageBind-LLM: multi-modality instruction tuning. arXiv preprint arXiv:2309.03905 (2023)"},{"key":"10_CR20","unstructured":"Jiang, A.Q., et al.: Mixtral of experts. Arxiv arXiv:2401.04088 (2024)"},{"key":"10_CR21","unstructured":"Li, B., et al.: LLaVA-next: stronger LLMs supercharge multimodal capabilities in the wild (2024). https:\/\/llava-vl.github.io\/blog\/2024-05-10-llava-next-stronger-llms\/"},{"key":"10_CR22","unstructured":"Li, F., et al.: LLaVA-next-interleave: tackling multi-image, video, and 3D in large multimodal models. arXiv preprint arXiv:2407.07895 (2024)"},{"key":"10_CR23","doi-asserted-by":"crossref","unstructured":"Lin, Z., et\u00a0al.: Sphinx: the joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. 
arXiv preprint arXiv:2311.07575 (2023)","DOI":"10.1007\/978-3-031-73033-7_3"},{"key":"10_CR24","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"10_CR25","unstructured":"Liu, H., et al.: LLaVA-next: improved reasoning, ocr, and world knowledge (2024). https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"10_CR26","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"10_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Y., et\u00a0al.: MMBench: is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"10_CR28","unstructured":"Lu, P., et al.: MathVista: evaluating math reasoning in visual contexts with GPT-4V, bard, and other large multimodal models. ArXiv abs\/2310.02255 (2023)"},{"key":"10_CR29","doi-asserted-by":"crossref","unstructured":"Lu, P., et al.: Inter-GPS: interpretable geometry problem solving with formal language and symbolic reasoning. arXiv preprint arXiv:2105.04165 (2021)","DOI":"10.18653\/v1\/2021.acl-long.528"},{"key":"10_CR30","unstructured":"OpenAI: ChatGPT (2023). https:\/\/chat.openai.com"},{"key":"10_CR31","unstructured":"OpenAI: GPT-4 technical report. ArXiv abs\/2303.08774 (2023)"},{"key":"10_CR32","unstructured":"OpenAI: GPT-4V(ision) system card (2023).https:\/\/openai.com\/research\/gpt-4v-system-card"},{"key":"10_CR33","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems (2022)"},{"key":"10_CR34","doi-asserted-by":"crossref","unstructured":"Seo, M., Hajishirzi, H., Farhadi, A., Etzioni, O., Malcolm, C.: Solving geometry problems: combining text and diagram interpretation. In: Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, pp. 1466\u20131476 (2015)","DOI":"10.18653\/v1\/D15-1171"},{"key":"10_CR35","doi-asserted-by":"crossref","unstructured":"Shi, W., et al.: Math-LLaVA: bootstrapping mathematical reasoning for multimodal large language models. arXiv preprint arXiv:2406.17294 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.268"},{"key":"10_CR36","unstructured":"Sun, K., et\u00a0al.: JourneyDB: a benchmark for generative image understanding. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"10_CR37","unstructured":"Touvron, H., et\u00a0al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"10_CR38","unstructured":"Touvron, H., et\u00a0al.: LLaMA 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"10_CR39","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Advances in Neural Information Processing Systems, vol. 35, pp. 24824\u201324837 (2022)"},{"key":"10_CR40","unstructured":"Ye, Q., et al.: mPLUG-Owl: modularization empowers large language models with multimodality (2023)"},{"key":"10_CR41","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: mPLUG-Owl2: revolutionizing multi-modal large language model with modality collaboration (2023)","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"10_CR42","unstructured":"Ying, K., et\u00a0al.: MMT-bench: a comprehensive multimodal benchmark for evaluating large vision-language models towards multitask AGI. 
In: ICML 2024 (2024)"},{"key":"10_CR43","unstructured":"Young, A., et\u00a0al.: Yi: Open foundation models by 01. AI. arXiv preprint arXiv:2403.04652 (2024)"},{"key":"10_CR44","doi-asserted-by":"crossref","unstructured":"Yue, X., et al.: MMMU: a massive multi-discipline multimodal understanding and reasoning benchmark for expert AGI. arXiv preprint arXiv:2311.16502 (2023)","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"10_CR45","unstructured":"Yue, X., Zheng, T., Zhang, G., Chen, W.: MAmmoTH2: scaling instructions from the web. arXiv preprint arXiv:2405.03548 (2024)"},{"key":"10_CR46","unstructured":"Zhang, R., et al.: LLaMA-adapter: efficient fine-tuning of large language models with zero-initialized attention. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=d4UiXAHN2W"},{"key":"10_CR47","unstructured":"Zhang, R., et\u00a0al.: MAVIS: mathematical visual instruction tuning. arXiv preprint arXiv:2407.08739 (2024)"},{"key":"10_CR48","unstructured":"Zheng, L., et\u00a0al.: Judging LLM-as-a-judge with MT-bench and chatbot arena. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73242-3_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T10:26:30Z","timestamp":1732962390000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73242-3_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,29]]},"ISBN":["9783031732416","9783031732423"],"references-count":48,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73242-3_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,29]]},"assertion":[{"value":"29 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}