{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T14:59:11Z","timestamp":1775228351500,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62471175"],"award-info":[{"award-number":["62471175"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754965","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"3057-3066","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["GeoUni: A Unified Model for Generating Geometry Diagrams, Problems and Problem Solutions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-4165-1384","authenticated-orcid":false,"given":"Jo-Ku","family":"Cheng","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0573-0339","authenticated-orcid":false,"given":"Zeren","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9960-2637","authenticated-orcid":false,"given":"Ran","family":"Chen","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3487-225X","authenticated-orcid":false,"given":"Jingyang","family":"Deng","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7206-3786","authenticated-orcid":false,"given":"Ziran","family":"Qin","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7388-4295","authenticated-orcid":false,"given":"Jinwen","family":"Ma","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arXiv:2308.12966 [cs.CV] https:\/\/arxiv.org\/abs\/2308.12966"},{"key":"e_1_3_2_1_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang Humen Zhong Yuanzhi Zhu Mingkun Yang Zhaohai Li Jianqiang Wan Pengfei Wang Wei Ding Zheren Fu Yiheng Xu Jiabo Ye Xi Zhang Tianbao Xie Zesen Cheng Hang Zhang Zhibo Yang Haiyang Xu and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arXiv:2502.13923 [cs.CV] https:\/\/arxiv.org\/abs\/2502.13923"},{"key":"e_1_3_2_1_3_1","volume-title":"Mansheej Paul, Philip Greengard, Connor Jennings, Daniel King, Sam Havens, Vitaliy Chiley, Jonathan Frankle, Cody Blakeney, and John P. Cunningham.","author":"Biderman Dan","year":"2024","unstructured":"Dan Biderman, Jacob Portes, Jose Javier Gonzalez Ortiz, Mansheej Paul, Philip Greengard, Connor Jennings, Daniel King, Sam Havens, Vitaliy Chiley, Jonathan Frankle, Cody Blakeney, and John P. Cunningham. 2024. LoRA Learns Less and Forgets Less. arXiv:2405.09673 [cs.LG] https:\/\/arxiv.org\/abs\/2405.09673"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.44"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Junsong Chen Chongjian Ge Enze Xie Yue Wu Lewei Yao Xiaozhe Ren Zhongdao Wang Ping Luo Huchuan Lu and Zhenguo Li. 2024. PixArt-\u03a3: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation. arXiv:2403.04692 [cs.CV] https:\/\/arxiv.org\/abs\/2403.04692","DOI":"10.1007\/978-3-031-73411-3_5"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.46"},{"key":"e_1_3_2_1_7_1","unstructured":"Xiaokang Chen Zhiyu Wu Xingchao Liu Zizheng Pan Wen Liu Zhenda Xie Xingkai Yu and Chong Ruan. 2025. Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling. arXiv:2501.17811 [cs.AI] https:\/\/arxiv.org\/abs\/2501.17811"},{"key":"e_1_3_2_1_8_1","volume-title":"ANOLE: An Open, Autoregressive, Native Large Multimodal Models for Interleaved Image-Text Generation. arXiv:2407.06135 [cs.CL] https:\/\/arxiv.org\/abs\/2407.06135","author":"Chern Ethan","year":"2024","unstructured":"Ethan Chern, Jiadi Su, Yan Ma, and Pengfei Liu. 2024. ANOLE: An Open, Autoregressive, Native Large Multimodal Models for Interleaved Image-Text Generation. arXiv:2407.06135 [cs.CL] https:\/\/arxiv.org\/abs\/2407.06135"},{"key":"e_1_3_2_1_9_1","unstructured":"DeepSeek-AI Daya Guo Dejian Yang and Others. 2025a. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948 [cs.CL] https:\/\/arxiv.org\/abs\/2501.12948"},{"key":"e_1_3_2_1_10_1","unstructured":"DeepSeek-AI Aixin Liu Bei Feng and Others. 2025b. DeepSeek-V3 Technical Report. arXiv:2412.19437 [cs.CL] https:\/\/arxiv.org\/abs\/2412.19437"},{"key":"e_1_3_2_1_11_1","unstructured":"Hugging Face. 2025. Stability AI \/ SDXL Turbo. https:\/\/huggingface.co\/stabilityai\/sdxl-turbo Accessed: 2025-04-02."},{"key":"e_1_3_2_1_12_1","unstructured":"Jiahui Gao Renjie Pi Jipeng Zhang Jiacheng Ye Wanjun Zhong Yufei Wang Lanqing Hong Jianhua Han Hang Xu Zhenguo Li and Lingpeng Kong. 2023. G-LLaVA: Solving Geometric Problem with Multi-Modal Large Language Model. arXiv:2312.11370 [cs.CL] https:\/\/arxiv.org\/abs\/2312.11370"},{"key":"e_1_3_2_1_13_1","unstructured":"Yuying Ge Sijie Zhao Jinguo Zhu Yixiao Ge Kun Yi Lin Song Chen Li Xiaohan Ding and Ying Shan. 2025. SEED-X: Multimodal Models with Unified Multi-granularity Comprehension and Generation. arXiv:2404.14396 [cs.CV] https:\/\/arxiv.org\/abs\/2404.14396"},{"key":"e_1_3_2_1_14_1","unstructured":"GeoGebra Team. 2024. GeoGebra. https:\/\/www.geogebra.org\/."},{"key":"e_1_3_2_1_15_1","unstructured":"Edward J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. LoRA: Low-Rank Adaptation of Large Language Models. arXiv:2106.09685 [cs.CL] https:\/\/arxiv.org\/abs\/2106.09685"},{"key":"e_1_3_2_1_16_1","volume-title":"Jesse Michael Han, and Daniel Selsam","author":"Krueger Ryan","year":"2021","unstructured":"Ryan Krueger, Jesse Michael Han, and Daniel Selsam. 2021. Automatically Building Diagrams for Olympiad Geometry Problems. In CADE. Springer International Publishing, Cham, 577-588."},{"key":"e_1_3_2_1_17_1","first-page":"34892","volume-title":"Levine (Eds.)","volume":"36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 34892-34916. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/6dcf277ea32ce3288914faf369fe6de0-Paper-Conference.pdf"},{"key":"e_1_3_2_1_18_1","unstructured":"Jiasen Lu Christopher Clark Rowan Zellers Roozbeh Mottaghi and Aniruddha Kembhavi. 2022. Unified-IO: A Unified Model for Vision Language and Multi-Modal Tasks. arXiv:2206.08916 [cs.CV] https:\/\/arxiv.org\/abs\/2206.08916"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.528"},{"key":"e_1_3_2_1_20_1","unstructured":"Zhuoyan Luo Fengyuan Shi Yixiao Ge Yujiu Yang Limin Wang and Ying Shan. 2025. Open-MAGVIT2: An Open-Source Project Toward Democratizing Auto-regressive Visual Generation. arXiv:2409.04410 [cs.CV] https:\/\/arxiv.org\/abs\/2409.04410"},{"key":"e_1_3_2_1_21_1","unstructured":"Chuofan Ma Yi Jiang Junfeng Wu Jihan Yang Xin Yu Zehuan Yuan Bingyue Peng and Xiaojuan Qi. 2025. UniTok: A Unified Tokenizer for Visual Generation and Understanding. arXiv:2502.20321 [cs.CV] https:\/\/arxiv.org\/abs\/2502.20321"},{"key":"e_1_3_2_1_22_1","unstructured":"Microsoft. 2025. Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs. arXiv:2503.01743 [cs.CL] https:\/\/arxiv.org\/abs\/2503.01743"},{"key":"e_1_3_2_1_23_1","unstructured":"OpenAI. 2025. Introducing 4.0 Image Generation. https:\/\/openai.com\/index\/introducing-4o-image-generation\/"},{"key":"e_1_3_2_1_24_1","unstructured":"PaddlePaddle. 2025. PaddleOCR. https:\/\/github.com\/PaddlePaddle\/PaddleOCR"},{"key":"e_1_3_2_1_25_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv:2204.06125 [cs.CV] https:\/\/arxiv.org\/abs\/2204.06125"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00368"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. arXiv:2112.10752 [cs.CV] https:\/\/arxiv.org\/abs\/2112.10752","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_28_1","unstructured":"Zhihong Shao Peiyi Wang Qihao Zhu Runxin Xu Junxiao Song Xiao Bi Haowei Zhang Mingchuan Zhang Y. K. Li Y. Wu and Daya Guo. 2024. DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. arXiv:2402.03300 [cs.CL] https:\/\/arxiv.org\/abs\/2402.03300"},{"key":"e_1_3_2_1_29_1","volume-title":"Emu: Generative Pretraining in Multimodality. arXiv:2307.05222 [cs.CV] https:\/\/arxiv.org\/abs\/2307.05222","author":"Sun Quan","year":"2024","unstructured":"Quan Sun, Qiying Yu, Yufeng Cui, Fan Zhang, Xiaosong Zhang, Yueze Wang, Hongcheng Gao, Jingjing Liu, Tiejun Huang, and Xinlong Wang. 2024. Emu: Generative Pretraining in Multimodality. arXiv:2307.05222 [cs.CV] https:\/\/arxiv.org\/abs\/2307.05222"},{"key":"e_1_3_2_1_30_1","volume-title":"Chameleon: Mixed-Modal Early-Fusion Foundation Models. arXiv:2405.09818 [cs.CL] https:\/\/arxiv.org\/abs\/2405.09818","author":"Team Chameleon","year":"2025","unstructured":"Chameleon Team. 2025. Chameleon: Mixed-Modal Early-Fusion Foundation Models. arXiv:2405.09818 [cs.CL] https:\/\/arxiv.org\/abs\/2405.09818"},{"key":"e_1_3_2_1_31_1","volume-title":"QVQ: To See the World with Wisdom. https:\/\/qwenlm.github.io\/blog\/qvq-72b-preview\/","author":"Team Qwen","year":"2024","unstructured":"Qwen Team. 2024. QVQ: To See the World with Wisdom. https:\/\/qwenlm.github.io\/blog\/qvq-72b-preview\/"},{"key":"e_1_3_2_1_32_1","first-page":"476","volume-title":"Nature","volume":"625","author":"Trinh Trieu H","year":"2024","unstructured":"Trieu H Trinh, Yuhuai Wu, Quoc V Le, He He, and Thang Luong. 2024. Solving olympiad geometry without human demonstrations. Nature, Vol. 625, 7995 (2024), 476-482."},{"key":"e_1_3_2_1_33_1","unstructured":"Junxiao Wang Ting Zhang Heng Yu Jingdong Wang and Hua Huang. 2025. MagicGeo: Training-Free Text-Guided Geometric Diagram Generation. arXiv:2502.13855 [cs.CV] https:\/\/arxiv.org\/abs\/2502.13855"},{"key":"e_1_3_2_1_34_1","unstructured":"Xinlong Wang Xiaosong Zhang Zhengxiong Luo Quan Sun Yufeng Cui Jinsheng Wang Fan Zhang Yueze Wang Zhen Li Qiying Yu Yingli Zhao Yulong Ao Xuebin Min Tao Li Boya Wu Bo Zhao Bowen Zhang Liangdong Wang Guang Liu Zheqi He Xi Yang Jingjing Liu Yonghua Lin Tiejun Huang and Zhongyuan Wang. 2024. Emu3: Next-Token Prediction is All You Need. arXiv:2409.18869 [cs.CV] https:\/\/arxiv.org\/abs\/2409.18869"},{"key":"e_1_3_2_1_35_1","unstructured":"Renqiu Xia Mingsheng Li Hancheng Ye Wenjie Wu Hongbin Zhou Jiakang Yuan Tianshuo Peng Xinyu Cai Xiangchao Yan Bin Wang Conghui He Botian Shi Tao Chen Junchi Yan and Bo Zhang. 2025. GeoX: Geometric Problem Solving Through Unified Formalized Vision-Language Pre-training. arXiv:2412.11863 [cs.CV] https:\/\/arxiv.org\/abs\/2412.11863"},{"key":"e_1_3_2_1_36_1","volume-title":"Weihao Wang, Kevin Qinghong Lin, Yuchao Gu, Zhijie Chen, Zhenheng Yang, and Mike Zheng Shou.","author":"Xie Jinheng","year":"2024","unstructured":"Jinheng Xie, Weijia Mao, Zechen Bai, David Junhao Zhang, Weihao Wang, Kevin Qinghong Lin, Yuchao Gu, Zhijie Chen, Zhenheng Yang, and Mike Zheng Shou. 2024. Show-o: One Single Transformer to Unify Multimodal Understanding and Generation. arXiv:2408.12528 [cs.CV] https:\/\/arxiv.org\/abs\/2408.12528"},{"key":"e_1_3_2_1_37_1","unstructured":"An Yang Beichen Zhang Binyuan Hui Bofei Gao Bowen Yu Chengpeng Li Dayiheng Liu Jianhong Tu Jingren Zhou Junyang Lin Keming Lu Mingfeng Xue Runji Lin Tianyu Liu Xingzhang Ren and Zhenru Zhang. 2024. Qwen2.5-Math Technical Report: Toward Mathematical Expert Model via Self-Improvement. arXiv:2409.12122 [cs.CL] https:\/\/arxiv.org\/abs\/2409.12122"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Ming-Liang Zhang Fei Yin and Cheng-Lin Liu. 2023. A Multi-Modal Neural Geometric Solver with Textual Clauses Parsed from Diagram. arXiv:2302.11097 [cs.AI] https:\/\/arxiv.org\/abs\/2302.11097","DOI":"10.24963\/ijcai.2023\/376"},{"key":"e_1_3_2_1_39_1","unstructured":"Xiaokai Zhang Na Zhu Yiming He Jia Zou Qike Huang Xiaoxiao Jin Yanjun Guo Chenyang Mao Yang Li Zhe Zhu Dengfeng Yue Fangzhen Zhu Yifan Wang Yiwen Huang Runan Wang Cheng Qin Zhenbing Zeng Shaorong Xie Xiangfeng Luo and Tuo Leng. 2024a. FormalGeo: An Extensible Formalized Framework for Olympiad Geometric Problem Solving. arXiv:2310.18021 [cs.AI] https:\/\/arxiv.org\/abs\/2310.18021"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.3390\/sym16040404"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889286"},{"key":"e_1_3_2_1_42_1","volume-title":"QLIP: Text-Aligned Visual Tokenization Unifies Auto-Regressive Multimodal Understanding and Generation. arXiv:2502.05178 [cs.CV] https:\/\/arxiv.org\/abs\/2502.05178","author":"Zhao Yue","year":"2025","unstructured":"Yue Zhao, Fuzhao Xue, Scott Reed, Linxi Fan, Yuke Zhu, Jan Kautz, Zhiding Yu, Philipp Kr\u00e4henb\u00fchl, and De-An Huang. 2025. QLIP: Text-Aligned Visual Tokenization Unifies Auto-Regressive Multimodal Understanding and Generation. arXiv:2502.05178 [cs.CV] https:\/\/arxiv.org\/abs\/2502.05178"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.3390\/sym1701000810.3390\/sym17010008"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754965","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:16:00Z","timestamp":1765340160000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754965"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":43,"alternative-id":["10.1145\/3746027.3754965","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754965","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}