{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T09:59:50Z","timestamp":1777888790775,"version":"3.51.4"},"reference-count":46,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key R&D Program of China","doi-asserted-by":"publisher","award":["2023YFA1008500"],"award-info":[{"award-number":["2023YFA1008500"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00380","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"3988-3998","source":"Crossref","is-referenced-by-count":0,"title":["Integrating Visual Interpretation and Linguistic Reasoning for Geometric Problem Solving"],"prefix":"10.1109","author":[{"given":"Zixian","family":"Guo","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ming","family":"Liu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qilong","family":"Wang","sequence":"additional","affiliation":[{"name":"Tianjin University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhilong","family":"Ji","sequence":"additional","affiliation":[{"name":"Tomorrow Advancing 
Life"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinfeng","family":"Bai","sequence":"additional","affiliation":[{"name":"Tomorrow Advancing Life"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lei","family":"Zhang","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wangmeng","family":"Zuo","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref2","article-title":"Qwen technical report","author":"Bai","year":"2023","journal-title":"arXiv preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0850"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-9283-6_573"},{"key":"ref5","author":"Deng","year":"2024","journal-title":"R-cot: Reverse chain-of-thought problem generation for geometric reasoning in large multimodal models"},{"key":"ref6","author":"Gao","year":"2023","journal-title":"G-llava: Solving geometric problem with multi-modal large language model"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01818"},{"key":"ref8","article-title":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning","author":"Guo","year":"2025","journal-title":"arXiv preprint"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-emnlp.766"},{"key":"ref10","author":"Hao","year":"2025","journal-title":"Can mllms reason in multimodality? 
emma: An enhanced multimodal reasoning benchmark"},{"key":"ref11","author":"Hosseini","year":"2024","journal-title":"Vstar: Training verifiers for self-taught reasoners"},{"key":"ref12","article-title":"Why vision language models struggle with visual arithmetic?","author":"Huang","year":"2025","journal-title":"towards enhanced chart and geometry understanding"},{"key":"ref13","author":"Kumar","year":"2024","journal-title":"Training language models to self-correct via reinforcement learning"},{"key":"ref14","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"International conference on machine learning","author":"Li","year":"2022"},{"key":"ref15","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning","author":"Li","year":"2023"},{"key":"ref16","author":"Lightman","year":"2023","journal-title":"Let\u2019s verify step by step"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref20","author":"Lu","year":"2024","journal-title":"Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.410"},{"key":"ref22","author":"Lyu","year":"2025","journal-title":"Exploring the limit of outcome reward for learning mathematical reasoning"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.1025"},{"key":"ref24","author":"Achiam","year":"2024","journal-title":"Gpt-4 technical report"},{"key":"ref25","author":"Peng","year":"2024","journal-title":"Multimath: Bridging visual and mathematical reasoning for large language 
models"},{"key":"ref26","author":"Qi","year":"2024","journal-title":"Mutual reasoning makes smaller llms stronger problem-solvers"},{"key":"ref27","author":"Qin","year":"2024","journal-title":"O1 replication journey: A strategic progress report - part 1"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2338"},{"key":"ref29","article-title":"Deepseekmath: Pushing the limits of mathematical reasoning in open language models","author":"Shao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.268"},{"key":"ref31","author":"Snell","year":"2024","journal-title":"Scaling llm test-time compute optimally can be more effective than scaling model parameters"},{"key":"ref32","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.510"},{"key":"ref34","author":"Wei","year":"2024","journal-title":"Slow perception: Let\u2019s perceive geometric figures step-by-step"},{"key":"ref35","author":"Wu","year":"2023","journal-title":"The role of chain-of-thought in complex vision-language reasoning task"},{"key":"ref36","author":"Xiang","year":"2024","journal-title":"Atomthink: A slow thinking framework for multimodal mathematical reasoning"},{"key":"ref37","author":"Xu","year":"2024","journal-title":"Llava-o1: Let vision language models reason step-by-step"},{"key":"ref38","author":"Yao","year":"2024","journal-title":"Mulberry: Empowering mllm with o1-like reasoning and reflection via collective monte carlo tree search"},{"key":"ref39","article-title":"Metamath: Bootstrap your own mathematical questions for large language models","author":"Yu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref40","article-title":"Mm-vet: Evaluating large multimodal models for integrated 
capabilities","author":"Yu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref41","author":"Zhang","year":"2024","journal-title":"Llama-berry: Pairwise optimization for o1-like olympiad-level mathematical reasoning"},{"key":"ref42","author":"Zhang","year":"2024","journal-title":"Euclid: Supercharging multimodal llms with synthetic high-fidelity visual descriptions"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_10"},{"key":"ref44","article-title":"Improve vision language model chain-of-thought reasoning","author":"Zhang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref45","author":"Zhang","year":"2024","journal-title":"How far are we from intelligent visual deductive reasoning?"},{"key":"ref46","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445998.pdf?arnumber=11445998","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T04:57:34Z","timestamp":1777611454000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445998\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":46,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00380","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}