{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T14:12:54Z","timestamp":1780495974278,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681167","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"147-155","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["OneChart: Purify the Chart Structural Extraction via One Auxiliary Token"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1821-9818","authenticated-orcid":false,"given":"Jinyue","family":"Chen","sequence":"first","affiliation":[{"name":"University of the Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9471-0278","authenticated-orcid":false,"given":"Lingyu","family":"Kong","sequence":"additional","affiliation":[{"name":"University of the Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2283-1895","authenticated-orcid":false,"given":"Haoran","family":"Wei","sequence":"additional","affiliation":[{"name":"MEGVII Technology, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0934-8161","authenticated-orcid":false,"given":"Chenglong","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8630-8270","authenticated-orcid":false,"given":"Zheng","family":"Ge","sequence":"additional","affiliation":[{"name":"MEGVII Technology, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4191-7814","authenticated-orcid":false,"given":"Liang","family":"Zhao","sequence":"additional","affiliation":[{"name":"MEGVII Technology, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1216-9626","authenticated-orcid":false,"given":"Jianjian","family":"Sun","sequence":"additional","affiliation":[{"name":"MEGVII Technology, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9725-280X","authenticated-orcid":false,"given":"Chunrui","family":"Han","sequence":"additional","affiliation":[{"name":"MEGVII Technology, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2138-4608","authenticated-orcid":false,"given":"Xiangyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"MEGVII Technology, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Reading and reasoning over chart images for evidence-based automated fact-checking. arXiv preprint arXiv:2301.11843","author":"Akhtar Mubashara","year":"2023","unstructured":"Mubashara Akhtar, Oana Cocarascu, and Elena Simperl. 2023. Reading and reasoning over chart images for evidence-based automated fact-checking. arXiv preprint arXiv:2301.11843 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00481"},{"key":"e_1_3_2_1_3_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Nougat: Neural optical understanding for academic documents. arXiv preprint arXiv:2308.13418","author":"Blecher Lukas","year":"2023","unstructured":"Lukas Blecher, Guillem Cucurull, Thomas Scialom, and Robert Stojnic. 2023. Nougat: Neural optical understanding for academic documents. arXiv preprint arXiv:2308.13418 (2023)."},{"key":"e_1_3_2_1_5_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_6_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03--30-vicuna\/."},{"key":"e_1_3_2_1_7_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Yunxuan Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et al. 2022. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_1_8_1","volume-title":"Dreamllm: Synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499","author":"Dong Runpei","year":"2023","unstructured":"Runpei Dong, Chunrui Han, Yuang Peng, Zekun Qi, Zheng Ge, Jinrong Yang, Liang Zhao, Jianjian Sun, Hongyu Zhou, Haoran Wei, et al. 2023. Dreamllm: Synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Mohammad Saeed Ehsani, and Hans-Peter Hutter","author":"Farahani Ali Mazraeh","year":"2023","unstructured":"Ali Mazraeh Farahani, Peyman Adibi, Alireza Darvishy, Mohammad Saeed Ehsani, and Hans-Peter Hutter. 2023. Automatic chart understanding: a review. IEEE Access (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Chartllama: A multimodal llm for chart understanding and generation. arXiv preprint arXiv:2311.16483","author":"Han Yucheng","year":"2023","unstructured":"Yucheng Han, Chi Zhang, Xin Chen, Xu Yang, Zhibin Wang, Gang Yu, Bin Fu, and Hanwang Zhang. 2023. Chartllama: A multimodal llm for chart understanding and generation. arXiv preprint arXiv:2311.16483 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Thomas M\u00fcller, Francesco Piccinno, and Julian Martin Eisenschlos.","author":"Herzig Jonathan","year":"2020","unstructured":"Jonathan Herzig, Pawe\u0142 Krzysztof Nowak, Thomas M\u00fcller, Francesco Piccinno, and Julian Martin Eisenschlos. 2020. TaPas: Weakly supervised table parsing via pre-training. arXiv preprint arXiv:2004.02349 (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"Rixie Tiffany Ko Leong, Jia Qing Tan, Enamul Hoque, and Shafiq Joty.","author":"Kantharaj Shankar","year":"2022","unstructured":"Shankar Kantharaj, Xuan Long Do, Rixie Tiffany Ko Leong, Jia Qing Tan, Enamul Hoque, and Shafiq Joty. 2022. Opencqa: Open-ended question answering with charts. arXiv preprint arXiv:2210.06628 (2022)."},{"key":"e_1_3_2_1_13_1","volume-title":"Xiang Lin, Ahmed Masry, Megh Thakkar, Enamul Hoque, and Shafiq Joty.","author":"Kantharaj Shankar","year":"2022","unstructured":"Shankar Kantharaj, Rixie Tiffany Ko Leong, Xiang Lin, Ahmed Masry, Megh Thakkar, Enamul Hoque, and Shafiq Joty. 2022. Chart-to-text: A large-scale benchmark for chart summarization. arXiv preprint arXiv:2203.06486 (2022)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Lee Kenton","year":"2023","unstructured":"Kenton Lee, Mandar Joshi, Iulia Raluca Turc, Hexiang Hu, Fangyu Liu, Julian Martin Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, and Kristina Toutanova. 2023. Pix2struct: Screenshot parsing as pretraining for visual language understanding. In International Conference on Machine Learning. PMLR, 18893--18912."},{"key":"e_1_3_2_1_16_1","unstructured":"Vladimir I Levenshtein et al. 1966. Binary codes capable of correcting deletions insertions and reversals. In Soviet physics doklady Vol. 10. Soviet Union 707--710."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.660"},{"key":"e_1_3_2_1_18_1","volume-title":"Matcha: Enhancing visual language pretraining with math reasoning and chart derendering. arXiv preprint arXiv:2212.09662","author":"Liu Fangyu","year":"2022","unstructured":"Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, and Julian Martin Eisenschlos. 2022. Matcha: Enhancing visual language pretraining with math reasoning and chart derendering. arXiv preprint arXiv:2212.09662 (2022)."},{"key":"e_1_3_2_1_19_1","volume-title":"2023 d. Mmc: Advancing multimodal chart understanding with large-scale instruction tuning. arXiv preprint arXiv:2311.10774","author":"Liu Fuxiao","year":"2023","unstructured":"Fuxiao Liu, Xiaoyang Wang, Wenlin Yao, Jianshu Chen, Kaiqiang Song, Sangwoo Cho, Yaser Yacoob, and Dong Yu. 2023 d. Mmc: Advancing multimodal chart understanding with large-scale instruction tuning. arXiv preprint arXiv:2311.10774 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_21_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li Bo Li Yuanhan Zhang Sheng Shen and Yong Jae Lee. 2024. LLaVA-NeXT: Improved reasoning OCR and world knowledge. https:\/\/llava-vl.github.io\/blog\/2024-01--30-llava-next\/"},{"key":"e_1_3_2_1_22_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning."},{"key":"e_1_3_2_1_23_1","volume-title":"Data extraction from charts via single deep neural network. arXiv preprint arXiv:1906.11906","author":"Liu Xiaoyi","year":"2019","unstructured":"Xiaoyi Liu, Diego Klabjan, and Patrick NBless. 2019. Data extraction from charts via single deep neural network. arXiv preprint arXiv:1906.11906 (2019)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00196"},{"key":"e_1_3_2_1_25_1","volume-title":"Enamul Hoque, and Shafiq Joty.","author":"Masry Ahmed","year":"2023","unstructured":"Ahmed Masry, Parsa Kavehzadeh, Xuan Long Do, Enamul Hoque, and Shafiq Joty. 2023. Unichart: A universal vision-language pretrained model for chart comprehension and reasoning. arXiv preprint arXiv:2305.14761 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Jia Qing Tan, Shafiq Joty, and Enamul Hoque.","author":"Masry Ahmed","year":"2022","unstructured":"Ahmed Masry, Do Xuan Long, Jia Qing Tan, Shafiq Joty, and Enamul Hoque. 2022. ChartQA: A benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244 (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"Chartassisstant: A universal chart multimodal language model via chart-to-table pre-training and multitask instruction tuning. arXiv preprint arXiv:2401.02384","author":"Meng Fanqing","year":"2024","unstructured":"Fanqing Meng, Wenqi Shao, Quanfeng Lu, Peng Gao, Kaipeng Zhang, Yu Qiao, and Ping Luo. 2024. Chartassisstant: A universal chart multimodal language model via chart-to-table pre-training and multitask instruction tuning. arXiv preprint arXiv:2401.02384 (2024)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093523"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TENSYMP54529.2022.9864560"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00446"},{"key":"e_1_3_2_1_31_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_32_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, Vol. 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/IRI51335.2021.00050"},{"key":"e_1_3_2_1_34_1","volume-title":"Vary: Scaling up the Vision Vocabulary for Large Vision-Language Models. arXiv preprint arXiv:2312.06109","author":"Wei Haoran","year":"2023","unstructured":"Haoran Wei, Lingyu Kong, Jinyue Chen, Liang Zhao, Zheng Ge, Jinrong Yang, Jianjian Sun, Chunrui Han, and Xiangyu Zhang. 2023. Vary: Scaling up the Vision Vocabulary for Large Vision-Language Models. arXiv preprint arXiv:2312.06109 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Small Language Model Meets with Reinforced Vision Vocabulary. arXiv preprint arXiv:2401.12503","author":"Wei Haoran","year":"2024","unstructured":"Haoran Wei, Lingyu Kong, Jinyue Chen, Liang Zhao, Zheng Ge, En Yu, Jianjian Sun, Chunrui Han, and Xiangyu Zhang. 2024. Small Language Model Meets with Reinforced Vision Vocabulary. arXiv preprint arXiv:2401.12503 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Structchart: Perception, structuring, reasoning for visual chart understanding. arXiv preprint arXiv:2309.11268","author":"Xia Renqiu","year":"2023","unstructured":"Renqiu Xia, Bo Zhang, Haoyang Peng, Ning Liao, Peng Ye, Botian Shi, Junchi Yan, and Yu Qiao. 2023. Structchart: Perception, structuring, reasoning for visual chart understanding. arXiv preprint arXiv:2309.11268 (2023)."},{"key":"e_1_3_2_1_37_1","unstructured":"Renqiu Xia Bo Zhang Hancheng Ye Xiangchao Yan Qi Liu Hongbin Zhou Zijun Chen Min Dou Botian Shi Junchi Yan and Yu Qiao. 2024. ChartX & ChartVLM: A Versatile Benchmark and Foundation Model for Complicated Chart Reasoning. arxiv: 2402.12185 [cs.CV]"},{"key":"e_1_3_2_1_38_1","volume-title":"NLPCC2019: Large-Scale Chinese Datasets for NLP. http:\/\/github.com\/brightmart\/nlp_chinese_corpus.","author":"Liang Xu.","unstructured":"Liang Xu. [n.,d.]. NLPCC2019: Large-Scale Chinese Datasets for NLP. http:\/\/github.com\/brightmart\/nlp_chinese_corpus."},{"key":"e_1_3_2_1_39_1","unstructured":"Jiabo Ye Anwen Hu Haiyang Xu Qinghao Ye Ming Yan Yuhao Dan Chenlin Zhao Guohai Xu Chenliang Li Junfeng Tian et al. 2023. mplug-docowl: Modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Merlin: Empowering Multimodal LLMs with Foresight Minds. arXiv preprint arXiv:2312.00589","author":"Yu En","year":"2023","unstructured":"En Yu, Liang Zhao, Yana Wei, Jinrong Yang, Dongming Wu, Lingyu Kong, Haoran Wei, Tiancai Wang, Zheng Ge, Xiangyu Zhang, et al. 2023. Merlin: Empowering Multimodal LLMs with Foresight Minds. arXiv preprint arXiv:2312.00589 (2023)."},{"key":"e_1_3_2_1_41_1","unstructured":"Ao Zhang Wei Ji and Tat-Seng Chua. 2023. NExT-Chat: An LMM for Chat Detection and Segmentation. arxiv: 2311.04498 [cs.CV]"},{"key":"e_1_3_2_1_42_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_43_1","volume-title":"Chatspot: Bootstrapping multimodal llms via precise referring instruction tuning. arXiv preprint arXiv:2307.09474","author":"Zhao Liang","year":"2023","unstructured":"Liang Zhao, En Yu, Zheng Ge, Jinrong Yang, Haoran Wei, Hongyu Zhou, Jianjian Sun, Yuang Peng, Runpei Dong, Chunrui Han, et al. 2023. Chatspot: Bootstrapping multimodal llms via precise referring instruction tuning. arXiv preprint arXiv:2307.09474 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681167","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681167","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681167"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":44,"alternative-id":["10.1145\/3664647.3681167","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681167","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}