{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:06:55Z","timestamp":1765339615476,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100019491","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62303231"],"award-info":[{"award-number":["No. 62303231"]}],"id":[{"id":"10.13039\/501100019491","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Startup Foundation for Introducing Talent of NUIST","award":["No. 2024r058"],"award-info":[{"award-number":["No. 2024r058"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755731","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"5050-5059","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Draw with Thought: Unleashing Multimodal Reasoning for Scientific Diagram Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-2384-1068","authenticated-orcid":false,"given":"Zhiqing","family":"Cui","sequence":"first","affiliation":[{"name":"Nanjing University of Information Science &amp; Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6194-450X","authenticated-orcid":false,"given":"Jiahao","family":"Yuan","sequence":"additional","affiliation":[{"name":"East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3329-2731","authenticated-orcid":false,"given":"Hanqing","family":"Wang","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4054-606X","authenticated-orcid":false,"given":"Yanshu","family":"Li","sequence":"additional","affiliation":[{"name":"Brown University, Providence, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7151-1056","authenticated-orcid":false,"given":"Chenxu","family":"Du","sequence":"additional","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2949-6287","authenticated-orcid":false,"given":"Zhenglong","family":"Ding","sequence":"additional","affiliation":[{"name":"Nanjing University of Information Science &amp; Technology, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13164-014-0215-2"},{"key":"e_1_3_2_1_2_1","unstructured":"Meta AI. 2024. Llama 3.2V: Vision Multimodal Large Model. Hugging Face. https:\/\/huggingface.co\/blog\/zh\/llama32"},{"key":"e_1_3_2_1_3_1","unstructured":"Christopher Gene Allen. 2011. The effects of visual complexity on cognitive load as influenced by field dependency and spatial ability. Ph.D. Dissertation. New York University."},{"key":"e_1_3_2_1_4_1","unstructured":"Anthropic. 2024. Introducing Claude 3.5 Sonnet. https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet"},{"key":"e_1_3_2_1_5_1","unstructured":"Anthropic. 2025. Claude 3.7 Sonnet and Claude Code. https:\/\/www.anthropic.com\/news\/claude-3-7-sonnet"},{"key":"e_1_3_2_1_6_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2.5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_7_1","volume-title":"Longwriter: Unleashing 10,000 word generation from long context llms. arXiv preprint arXiv:2408.07055","author":"Bai Yushi","year":"2024","unstructured":"Yushi Bai, Jiajie Zhang, Xin Lv, Linzhi Zheng, Siqi Zhu, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li. 2024. Longwriter: Unleashing 10,000 word generation from long context llms. arXiv preprint arXiv:2408.07055 (2024)."},{"volume-title":"AutomaTikZ: Text-Guided Synthesis of Scientific Vector Graphics with TikZ. In The Twelfth International Conference on Learning Representations.","author":"Belouadi Jonas","key":"e_1_3_2_1_8_1","unstructured":"Jonas Belouadi, Anne Lauscher, and Steffen Eger. [n.d.]. AutomaTikZ: Text-Guided Synthesis of Scientific Vector Graphics with TikZ. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2701"},{"volume-title":"Nougat: Neural Optical Understanding for Academic Documents. In The Twelfth International Conference on Learning Representations.","author":"Blecher Lukas","key":"e_1_3_2_1_10_1","unstructured":"Lukas Blecher, Guillem Cucurull, Thomas Scialom, and Robert Stojnic. [n.d.]. Nougat: Neural Optical Understanding for Academic Documents. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681167"},{"key":"e_1_3_2_1_13_1","volume-title":"Vision-language models can self-improve reasoning via reflection. arXiv preprint arXiv:2411.00855","author":"Cheng Kanzhi","year":"2024","unstructured":"Kanzhi Cheng, Yantao Li, Fangzhi Xu, Jianbing Zhang, Hao Zhou, and Yang Liu. 2024. Vision-language models can self-improve reasoning via reflection. arXiv preprint arXiv:2411.00855 (2024)."},{"key":"e_1_3_2_1_14_1","first-page":"49250","volume-title":"Levine (Eds.)","volume":"36","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, DONGXU LI, Anthony Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 49250-49267. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/9a6a435e75419a836fe47ab6793623e6-Paper-Conference.pdf"},{"volume-title":"SVG essentials: Producing scalable vector graphics with XML. '' O'Reilly Media","author":"David Eisenberg J","key":"e_1_3_2_1_15_1","unstructured":"J David Eisenberg and Amelia Bellamy-Royds. 2014. SVG essentials: Producing scalable vector graphics with XML. '' O'Reilly Media, Inc.''."},{"key":"e_1_3_2_1_16_1","unstructured":"Rongyao Fang Chengqi Duan Kun Wang Linjiang Huang Hao Li Shilin Yan Hao Tian Xingyu Zeng Rui Zhao Jifeng Dai et al. 2025. GoT: Unleashing Reasoning Capability of Multimodal Large Language Model for Visual Generation and Editing. arXiv preprint arXiv:2503.10639 (2025)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925946"},{"volume-title":"Handbook of choice modelling","author":"Flynn Terry N","key":"e_1_3_2_1_18_1","unstructured":"Terry N Flynn and Anthony AJ Marley. 2014. Best-worst scaling: theory and methods. In Handbook of choice modelling. Edward Elgar Publishing, 178-201."},{"volume-title":"InCoder: A Generative Model for Code Infilling and Synthesis. In The Eleventh International Conference on Learning Representations.","author":"Fried Daniel","key":"e_1_3_2_1_19_1","unstructured":"Daniel Fried, Armen Aghajanyan, Jessy Lin, Sida Wang, Eric Wallace, Freda Shi, Ruiqi Zhong, Scott Yih, Luke Zettlemoyer, and Mike Lewis. [n.d.]. InCoder: A Generative Model for Code Infilling and Synthesis. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_20_1","volume-title":"Drawing Pandas: A Benchmark for LLMs in Generating Plotting Code. arXiv preprint arXiv:2412.02764","author":"Galimzyanov Timur","year":"2024","unstructured":"Timur Galimzyanov, Sergey Titov, Yaroslav Golubev, and Egor Bogomolov. 2024. Drawing Pandas: A Benchmark for LLMs in Generating Plotting Code. arXiv preprint arXiv:2412.02764 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681249"},{"key":"e_1_3_2_1_22_1","volume-title":"Structure-mapping: A theoretical framework for analogy. Cognitive science","author":"Gentner Dedre","year":"1983","unstructured":"Dedre Gentner. 1983. Structure-mapping: A theoretical framework for analogy. Cognitive science, Vol. 7, 2 (1983), 155-170."},{"key":"e_1_3_2_1_23_1","unstructured":"Google. 2024. Try our newest 2.0 Experimental Advanced model in Gemini Advanced. https:\/\/blog.google\/feed\/gemini-exp-1206"},{"key":"e_1_3_2_1_24_1","volume-title":"Code Llama: Open Foundation Models for Code. arXiv preprint arXiv:2308.12950","author":"Grattafiori Wenhan Xiong","year":"2023","unstructured":"Wenhan Xiong Grattafiori, Alexandre D\u00e9fossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, and Gabriel Synnaeve. 2023. Code Llama: Open Foundation Models for Code. arXiv preprint arXiv:2308.12950 (2023)."},{"key":"e_1_3_2_1_25_1","unstructured":"Mary Hegarty Patricia A Carpenter and Marcel Adam Just. 1991. Diagrams in the comprehension of scientific texts. (1991)."},{"key":"e_1_3_2_1_26_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_1_27_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_28_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Josh Kaplan and Luis Rabelo. 2024. Bridging the Gap: Leveraging Informal Software Architecture Artifacts for Structured Model Creation. (2024).","DOI":"10.20944\/preprints202405.0305.v1"},{"volume-title":"The Palgrave handbook of survey research","author":"Krosnick Jon A","key":"e_1_3_2_1_30_1","unstructured":"Jon A Krosnick. 2017. Questionnaire design. In The Palgrave handbook of survey research. Springer, 439-455."},{"key":"e_1_3_2_1_31_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_32_1","volume-title":"MMCode: Benchmarking Multimodal Large Language Models for Code Generation with Visually Rich Programming Problems. arXiv preprint arXiv:2404.09486","author":"Li Kaixin","year":"2024","unstructured":"Kaixin Li, Yuchen Tian, Qisheng Hu, Ziyang Luo, Zhiyong Huang, and Jing Ma. 2024. MMCode: Benchmarking Multimodal Large Language Models for Code Generation with Visually Rich Programming Problems. arXiv preprint arXiv:2404.09486 (2024)."},{"key":"e_1_3_2_1_33_1","unstructured":"Raymond Li Yangtian Zi Niklas Muennighoff Denis Kocetkov Chenghao Mou Marc Marone Christopher Akiki LI Jia Jenny Chim Qian Liu et al. [n.d.]. StarCoder: may the source be with you! Transactions on Machine Learning Research ([n.d.])."},{"key":"e_1_3_2_1_34_1","volume-title":"Scigraphqa: A large-scale synthetic multi-turn question-answering dataset for scientific graphs. arXiv preprint arXiv:2308.03349","author":"Li Shengzhi","year":"2023","unstructured":"Shengzhi Li and Nima Tajbakhsh. 2023. Scigraphqa: A large-scale synthetic multi-turn question-answering dataset for scientific graphs. arXiv preprint arXiv:2308.03349 (2023)."},{"volume-title":"Best-worst scaling: Theory, methods and applications","author":"Louviere Jordan J","key":"e_1_3_2_1_35_1","unstructured":"Jordan J Louviere, Terry N Flynn, and Anthony Alfred John Marley. 2015. Best-worst scaling: Theory, methods and applications. Cambridge University Press."},{"key":"e_1_3_2_1_36_1","volume-title":"Fantastically ordered prompts and where to find them: Overcoming few-shot prompt order sensitivity. arXiv preprint arXiv:2104.08786","author":"Lu Yao","year":"2021","unstructured":"Yao Lu, Max Bartolo, Alastair Moore, Sebastian Riedel, and Pontus Stenetorp. 2021. Fantastically ordered prompts and where to find them: Overcoming few-shot prompt order sensitivity. arXiv preprint arXiv:2104.08786 (2021)."},{"key":"e_1_3_2_1_37_1","first-page":"46534","article-title":"Self-refine: Iterative refinement with self-feedback","volume":"36","author":"Madaan Aman","year":"2023","unstructured":"Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et al., 2023. Self-refine: Iterative refinement with self-feedback. Advances in Neural Information Processing Systems, Vol. 36 (2023), 46534-46594.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1002\/9781118445112.stat02802"},{"key":"e_1_3_2_1_39_1","unstructured":"OpenAI. 2024. Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o"},{"key":"e_1_3_2_1_40_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Jan L Plass Roxana Moreno and Roland Br\u00fcnken. 2010. Cognitive load theory. (2010).","DOI":"10.1017\/CBO9780511844744"},{"key":"e_1_3_2_1_42_1","first-page":"114843","article-title":"Agent planning with world knowledge model","volume":"37","author":"Qiao Shuofei","year":"2024","unstructured":"Shuofei Qiao, Runnan Fang, Ningyu Zhang, Yuqi Zhu, Xiang Chen, Shumin Deng, Yong Jiang, Pengjun Xie, Fei Huang, and Huajun Chen. 2024. Agent planning with world knowledge model. Advances in Neural Information Processing Systems, Vol. 37 (2024), 114843-114871.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","first-page":"115058","article-title":"Image2struct: Benchmarking structure extraction for vision-language models","volume":"37","author":"Roberts Josselin","year":"2024","unstructured":"Josselin Roberts, Tony Lee, Chi Heem Wong, Michihiro Yasunaga, Yifan Mai, and Percy S Liang. 2024. Image2struct: Benchmarking structure extraction for vision-language models. Advances in Neural Information Processing Systems, Vol. 37 (2024), 115058-115097.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_44_1","volume-title":"Starvector: Generating scalable vector graphics code from images. arXiv preprint arXiv:2312.11556","author":"Rodriguez Juan A","year":"2023","unstructured":"Juan A Rodriguez, Shubham Agarwal, Issam H Laradji, Pau Rodriguez, David Vazquez, Christopher Pal, and Marco Pedersoli. 2023. Starvector: Generating scalable vector graphics code from images. arXiv preprint arXiv:2312.11556 (2023)."},{"key":"e_1_3_2_1_45_1","unstructured":"Christoph Schuhmann. 2022. Improved Aesthetic Predictor. https:\/\/github.com\/christophschuhmann\/improved-aesthetic-predictor."},{"key":"e_1_3_2_1_46_1","volume-title":"Design2code: How far are we from automating front-end engineering? arXiv e-prints","author":"Si Chenglei","year":"2024","unstructured":"Chenglei Si, Yanzhe Zhang, Zhengyuan Yang, Ruibo Liu, and Diyi Yang. 2024. Design2code: How far are we from automating front-end engineering? arXiv e-prints (2024), arXiv-2403."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"e_1_3_2_1_48_1","volume-title":"LayerTracer: Cognitive-Aligned Layered SVG Synthesis via Diffusion Transformer. arXiv preprint arXiv:2502.01105","author":"Song Yiren","year":"2025","unstructured":"Yiren Song, Danze Chen, and Mike Zheng Shou. 2025. LayerTracer: Cognitive-Aligned Layered SVG Synthesis via Diffusion Transformer. arXiv preprint arXiv:2502.01105 (2025)."},{"key":"e_1_3_2_1_49_1","volume-title":"Abubakar Abid, Adam Fisch, Adam R Brown, Adam Santoro, Aditya Gupta, Adri\u00e0 Garriga-Alonso, et al.","author":"Srivastava Aarohi","year":"2022","unstructured":"Aarohi Srivastava, Abhinav Rastogi, Abhishek Rao, Abu Awal Md Shoeb, Abubakar Abid, Adam Fisch, Adam R Brown, Adam Santoro, Aditya Gupta, Adri\u00e0 Garriga-Alonso, et al., 2022. Beyond the imitation game: Quantifying and extrapolating the capabilities of language models. arXiv preprint arXiv:2206.04615 (2022)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.147"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3478513.3480488","article-title":"Deepvecfont: synthesizing high-quality vector fonts via dual-modality learning","volume":"40","author":"Wang Yizhi","year":"2021","unstructured":"Yizhi Wang and Zhouhui Lian. 2021. Deepvecfont: synthesizing high-quality vector fonts via dual-modality learning. ACM Transactions on Graphics (TOG), Vol. 40, 6 (2021), 1-15.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3395027.3419580"},{"key":"e_1_3_2_1_53_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_54_1","volume-title":"Plot2code: A comprehensive benchmark for evaluating multi-modal large language models in code generation from scientific plots. arXiv preprint arXiv:2405.07990","author":"Wu Chengyue","year":"2024","unstructured":"Chengyue Wu, Yixiao Ge, Qiushan Guo, Jiahao Wang, Zhixuan Liang, Zeyu Lu, Ying Shan, and Ping Luo. 2024a. Plot2code: A comprehensive benchmark for evaluating multi-modal large language models in code generation from scientific plots. arXiv preprint arXiv:2405.07990 (2024)."},{"key":"e_1_3_2_1_55_1","unstructured":"Jialong Wu Wenbiao Yin Yong Jiang Zhenglin Wang Zekun Xi Runnan Fang Linhai Zhang Yulan He Deyu Zhou Pengjun Xie et al. 2025. WebWalker: Benchmarking LLMs in Web Traversal. arXiv preprint arXiv:2501.07572 (2025)."},{"key":"e_1_3_2_1_56_1","volume-title":"Lotlip: Improving language-image pre-training for long text understanding. arXiv preprint arXiv:2410.05249","author":"Wu Wei","year":"2024","unstructured":"Wei Wu, Kecheng Zheng, Shuailei Ma, Fan Lu, Yuxin Guo, Yifei Zhang, Wei Chen, Qingpei Guo, Yujun Shen, and Zheng-Jun Zha. 2024b. Lotlip: Improving language-image pre-training for long text understanding. arXiv preprint arXiv:2410.05249 (2024)."},{"key":"e_1_3_2_1_57_1","first-page":"29680","article-title":"Symbol-LLM: leverage language models for symbolic system in visual human activity reasoning","volume":"36","author":"Wu Xiaoqian","year":"2023","unstructured":"Xiaoqian Wu, Yong-Lu Li, Jianhua Sun, and Cewu Lu. 2023. Symbol-LLM: leverage language models for symbolic system in visual human activity reasoning. Advances in Neural Information Processing Systems, Vol. 36 (2023), 29680-29691.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_58_1","unstructured":"xAI. 2025. Grok 3 Beta - The Age of Reasoning Agents. https:\/\/x.ai\/blog\/grok-3. Accessed: 2025-02-21."},{"key":"e_1_3_2_1_59_1","volume-title":"WuKong: A Large Multimodal Model for Efficient Long PDF Reading with End-to-End Sparse Sampling. arXiv preprint arXiv:2410.05970","author":"Xie Xudong","year":"2024","unstructured":"Xudong Xie, Hao Yan, Liang Yin, Yang Liu, Jing Ding, Minghui Liao, Yuliang Liu, Wei Chen, and Xiang Bai. 2024. WuKong: A Large Multimodal Model for Efficient Long PDF Reading with End-to-End Sparse Sampling. arXiv preprint arXiv:2410.05970 (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"Empowering LLMs to Understand and Generate Complex Vector Graphics. arXiv preprint arXiv:2412.11102","author":"Xing Ximing","year":"2024","unstructured":"Ximing Xing, Juncheng Hu, Guotao Liang, Jing Zhang, Dong Xu, and Qian Yu. 2024. Empowering LLMs to Understand and Generate Complex Vector Graphics. arXiv preprint arXiv:2412.11102 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Chartmimic: Evaluating lmm's cross-modal reasoning capability via chart-to-code generation. arXiv preprint arXiv:2406.09961","author":"Yang Cheng","year":"2024","unstructured":"Cheng Yang, Chufan Shi, Yaxin Liu, Bo Shui, Junjie Wang, Mohan Jing, Linran Xu, Xinyu Zhu, Siheng Li, Yuxiang Zhang, et al., 2024. Chartmimic: Evaluating lmm's cross-modal reasoning capability via chart-to-code generation. arXiv preprint arXiv:2406.09961 (2024)."},{"volume-title":"ReAct: Synergizing Reasoning and Acting in Language Models. In The Eleventh International Conference on Learning Representations.","author":"Yao Shunyu","key":"e_1_3_2_1_62_1","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik R Narasimhan, and Yuan Cao. [n.d.]. ReAct: Synergizing Reasoning and Acting in Language Models. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_63_1","volume-title":"LongProc: Benchmarking Long-Context Language Models on Long Procedural Generation. arXiv preprint arXiv:2501.05414","author":"Ye Xi","year":"2025","unstructured":"Xi Ye, Fangcong Yin, Yinghui He, Joie Zhang, Howard Yen, Tianyu Gao, Greg Durrett, and Danqi Chen. 2025. LongProc: Benchmarking Long-Context Language Models on Long Procedural Generation. arXiv preprint arXiv:2501.05414 (2025)."},{"key":"e_1_3_2_1_64_1","volume-title":"Fahimeh Moafian, and Zhixue Zhao.","author":"Zhang Leixin","year":"2024","unstructured":"Leixin Zhang, Steffen Eger, Yinjie Cheng, Weihe Zhai, Jonas Belouadi, Christoph Leiter, Simone Paolo Ponzetto, Fahimeh Moafian, and Zhixue Zhao. 2024. ScImage: How good are multimodal large language models at scientific text-to-image generation? arXiv preprint arXiv:2412.02368 (2024)."},{"key":"e_1_3_2_1_65_1","volume-title":"ChartCoder: Advancing Multimodal Large Language Model for Chart-to-Code Generation. arXiv preprint arXiv:2501.06598","author":"Zhao Xuanle","year":"2025","unstructured":"Xuanle Zhao, Xianzhen Luo, Qi Shi, Chi Chen, Shuo Wang, Wanxiang Che, Zhiyuan Liu, and Maosong Sun. 2025. ChartCoder: Advancing Multimodal Large Language Model for Chart-to-Code Generation. arXiv preprint arXiv:2501.06598 (2025)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755731","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:02:50Z","timestamp":1765339370000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755731"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":65,"alternative-id":["10.1145\/3746027.3755731","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755731","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}