{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:16:55Z","timestamp":1778080615208,"version":"3.51.4"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032046260","type":"print"},{"value":"9783032046277","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T00:00:00Z","timestamp":1757980800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T00:00:00Z","timestamp":1757980800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04627-7_30","type":"book-chapter","created":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T02:09:59Z","timestamp":1757988599000},"page":"523-537","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["RefChartQA: Grounding Visual Answer on\u00a0Chart Images Through Instruction Tuning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-4357-8525","authenticated-orcid":false,"given":"Alexander","family":"Vogel","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4227-8417","authenticated-orcid":false,"given":"Omar","family":"Moured","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3670-4567","authenticated-orcid":false,"given":"Yufan","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3471-328X","authenticated-orcid":false,"given":"Jiaming","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8046-4945","authenticated-orcid":false,"given":"Rainer","family":"Stiefelhagen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,9,16]]},"reference":[{"key":"30_CR1","unstructured":"Bai, S., et al.: Qwen2.5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)"},{"key":"30_CR2","unstructured":"Beyer, L., et al.: Paligemma: a versatile 3B VLM for transfer (2024)"},{"key":"30_CR3","unstructured":"Bolya, D., Fu, C.Y., Dai, X., Zhang, P., Feichtenhofer, C., Hoffman, J.: Token merging: your VIT but faster. arXiv preprint arXiv:2210.09461 (2022)"},{"key":"30_CR4","doi-asserted-by":"crossref","unstructured":"Chen, C., Anjum, S., Gurari, D.: Grounding answers for visual questions asked by visually impaired people. In: Proceedings of the IEEE\/CVF CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01851"},{"key":"30_CR5","doi-asserted-by":"crossref","unstructured":"Chen, C., Anjum, S., Gurari, D.: VQA therapy: exploring answer differences by visually grounding answers. In: Proceedings of the IEEE\/CVF ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01405"},{"key":"30_CR6","unstructured":"Chen, T., Saxena, S., Li, L., Fleet, D.J., Hinton, G.: Pix2seq: a language modeling framework for object detection. arXiv preprint arXiv:2109.10852 (2021)"},{"key":"30_CR7","unstructured":"Chen, T., Saxena, S., Li, L., Lin, T.Y., Fleet, D.J., Hinton, G.: A unified sequence interface for vision tasks. arXiv preprint arXiv:2206.07669 (2022)"},{"key":"30_CR8","unstructured":"Ebrahimi\u00a0Kahou, S., Atkinson, A., Michalski, V., K\u00e1d\u00e1r, \u00c1., Trischler, A., Bengio, Y.: Figureqa: an annotated figure dataset for visual reasoning. CoRR abs\/1710.07300 (2017)"},{"key":"30_CR9","unstructured":"Han, Y., et al.: ChartLlama: A Multimodal LLM for Chart Understanding and Generation. arXiv preprint arXiv:2311.16483 (2023)"},{"key":"30_CR10","unstructured":"He, S., Ding, H., Liu, C., Jiang, X.: Grec: generalized referring expression comprehension. arXiv preprint arXiv:2308.16182 (2023)"},{"key":"30_CR11","doi-asserted-by":"crossref","unstructured":"Hu, A., et al.: mPLUG-DocOwl 1.5: unified structure learning for OCR-free document understanding. In: Findings of ACL: EMNLP 2024 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.175"},{"key":"30_CR12","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. In: ICLR, vol. 1, no. 2, p. 3 (2022)"},{"key":"30_CR13","doi-asserted-by":"crossref","unstructured":"Huang, M., et al.: Evochart: a benchmark and a self-training approach towards real-world chart understanding (2024)","DOI":"10.1609\/aaai.v39i4.32383"},{"key":"30_CR14","doi-asserted-by":"crossref","unstructured":"Kafle, K., Price, B., Cohen, S., Kanan, C.: DVQA: understanding data visualizations via question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00592"},{"key":"30_CR15","doi-asserted-by":"crossref","unstructured":"Kantharaj, S., et al.: Chart-to-text: a large-scale benchmark for chart summarization. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Dublin, Ireland, pp. 4005\u20134023. Association for Computational Linguistics (2022)","DOI":"10.18653\/v1\/2022.acl-long.277"},{"key":"30_CR16","doi-asserted-by":"crossref","unstructured":"Khoshsirat, S., Kambhamettu, C.: Sentence attention blocks for answer grounding. In: Proceedings of the IEEE\/CVF ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00559"},{"key":"30_CR17","doi-asserted-by":"crossref","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"30_CR18","unstructured":"Lee, K., et al.: Pix2struct: screenshot parsing as pretraining for visual language understanding. In: Proceedings of the 40th International Conference on Machine Learning, ICML 2023. JMLR.org (2023)"},{"key":"30_CR19","doi-asserted-by":"crossref","unstructured":"Li, Z., Jasani, B., Tang, P., Ghadar, S.: Synthesize step-by-step: tools templates and LLMs as data generators for reasoning-based chart VQA. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13613\u201313623 (2024)","DOI":"10.1109\/CVPR52733.2024.01292"},{"issue":"11","key":"30_CR20","doi-asserted-by":"publisher","first-page":"2061","DOI":"10.3390\/electronics13112061","volume":"13","author":"Z Lin","year":"2024","unstructured":"Lin, Z., Chen, L., Chen, Y., Su, L.: LCV2: a universal pretraining-free framework for grounded visual question answering. Electronics 13(11), 2061 (2024)","journal-title":"Electronics"},{"key":"30_CR21","doi-asserted-by":"crossref","unstructured":"Liu, F., et al.: Deplot: one-shot visual language reasoning by plot-to-table translation. In: Findings of the Association for Computational Linguistics: ACL 2023, pp. 10381\u201310399 (2023)","DOI":"10.18653\/v1\/2023.findings-acl.660"},{"key":"30_CR22","unstructured":"Liu, F., et al.: Matcha: enhancing visual language pretraining with math reasoning and chart derendering. arXiv preprint arXiv:2212.09662 (2022)"},{"key":"30_CR23","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"30_CR24","doi-asserted-by":"crossref","unstructured":"Liu, Y., Mishra, N., Sieb, M., Shentu, Y., Abbeel, P., Chen, X.: Autoregressive uncertainty modeling for 3D bounding box prediction. In: European Conference on Computer Vision, pp. 673\u2013694. Springer, Cham (2022)","DOI":"10.1007\/978-3-031-20080-9_39"},{"key":"30_CR25","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems (NeurIPS), vol.\u00a032 (2019)"},{"key":"30_CR26","doi-asserted-by":"crossref","unstructured":"Lu, J., et al.: A bounding box is worth one token: interleaving layout and text in a large language model for document understanding. arXiv preprint arXiv:2407.01976 (2024)","DOI":"10.18653\/v1\/2025.findings-acl.379"},{"key":"30_CR27","doi-asserted-by":"crossref","unstructured":"Ma, C., Jiang, Y., Wu, J., Yuan, Z., Qi, X.: Groma: localized visual tokenization for grounding multimodal large language models. arXiv preprint arXiv:2404.13013 (2024)","DOI":"10.1007\/978-3-031-72658-3_24"},{"key":"30_CR28","doi-asserted-by":"crossref","unstructured":"Masry, A., Do, X.L., Tan, J.Q., Joty, S., Hoque, E.: ChartQA: a benchmark for question answering about charts with visual and logical reasoning. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Findings of ACL: ACL 2022, Dublin, Ireland, pp. 2263\u20132279. Association for Computational Linguistics (2022)","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"30_CR29","doi-asserted-by":"crossref","unstructured":"Masry, A., Kavehzadeh, P., Do, X.L., Hoque, E., Joty, S.: UniChart: a universal vision-language pretrained model for chart comprehension and reasoning. In: Bouamor, H., Pino, J., Bali, K. (eds.) Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, Singapore, pp. 14662\u201314684. Association for Computational Linguistics (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.906"},{"key":"30_CR30","unstructured":"Masry, A., Thakkar, M., Bajaj, A., Kartha, A., Hoque, E., Joty, S.: ChartGemma: visual instruction-tuning for chart reasoning in the wild. In: Rambow, O., et al. (eds.) Proceedings of the 31st International Conference on Computational Linguistics: Industry Track, Abu Dhabi, UAE, pp. 625\u2013643. Association for Computational Linguistics (2025)"},{"key":"30_CR31","doi-asserted-by":"crossref","unstructured":"Methani, N., Ganguly, P., Khapra, M.M., Kumar, P.: Plotqa: reasoning over scientific plots. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV) (2020)","DOI":"10.1109\/WACV45572.2020.9093523"},{"key":"30_CR32","unstructured":"Steiner, A., et al.: Paligemma 2: a family of versatile VLMs for transfer (2024)"},{"key":"30_CR33","unstructured":"Wang, P., et al.: Qwen2-vl: enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)"},{"key":"30_CR34","unstructured":"Xia, R., et al.: Chartx & chartvlm: a versatile benchmark and foundation model for complicated chart reasoning. arXiv preprint arXiv:2402.12185 (2024)"},{"key":"30_CR35","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: Llava-grounding: grounded visual chat with large multimodal models (2023)","DOI":"10.1007\/978-3-031-72775-7_2"},{"key":"30_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, L., et al.: TinyChart: efficient chart understanding with program-of-thoughts learning and visual token merging. In: Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1882\u20131898 (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.112"},{"key":"30_CR37","unstructured":"Zhao, Y., Lin, Z., Zhou, D., Huang, Z., Feng, J., Kang, B.: Bubogpt: enabling visual grounding in multi-modal LLMs (2023)"},{"key":"30_CR38","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Groth, O., Bernstein, M., Fei-Fei, L.: Visual7w: grounded question answering in images. In: CVPR, pp. 4995\u20135004 (2016)","DOI":"10.1109\/CVPR.2016.540"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04627-7_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T02:10:14Z","timestamp":1757988614000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04627-7_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,16]]},"ISBN":["9783032046260","9783032046277"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04627-7_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,16]]},"assertion":[{"value":"16 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}