{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T16:54:01Z","timestamp":1775494441912,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,2]],"date-time":"2024-05-02T00:00:00Z","timestamp":1714608000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,2]]},"DOI":"10.1145\/3613905.3650738","type":"proceedings-article","created":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T08:15:21Z","timestamp":1715415321000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["SciCapenter: Supporting Caption Composition for Scientific Figures with Machine-Generated Captions and Ratings"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9082-6039","authenticated-orcid":false,"given":"Ting-Yao","family":"Hsu","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, Pennsylvania State University, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6736-9959","authenticated-orcid":false,"given":"Chieh-Yang","family":"Huang","sequence":"additional","affiliation":[{"name":"College of Information Sciences and Technology, Pennsylvania State University, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4298-5325","authenticated-orcid":false,"given":"Shih-Hong","family":"Huang","sequence":"additional","affiliation":[{"name":"College of Information Sciences and Technology, The Pennsylvania State University, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9758-0635","authenticated-orcid":false,"given":"Ryan","family":"Rossi","sequence":"additional","affiliation":[{"name":"Adobe Research, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3580-5290","authenticated-orcid":false,"given":"Sungchul","family":"Kim","sequence":"additional","affiliation":[{"name":"Adobe Research, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5991-2050","authenticated-orcid":false,"given":"Tong","family":"Yu","sequence":"additional","affiliation":[{"name":"Adobe Research, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1931-585X","authenticated-orcid":false,"given":"C Lee","family":"Giles","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, Pennsylvania State University, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7021-4627","authenticated-orcid":false,"given":"Ting-Hao Kenneth","family":"Huang","sequence":"additional","affiliation":[{"name":"College of Information Sciences and Technology, Pennsylvania State University, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,5,11]]},"reference":[{"key":"e_1_3_3_3_1_1","volume-title":"SciBERT: A pretrained language model for scientific text. arXiv preprint arXiv:1903.10676","author":"Beltagy Iz","year":"2019","unstructured":"Iz Beltagy, Kyle Lo, and Arman Cohan. 2019. SciBERT: A pretrained language model for scientific text. arXiv preprint arXiv:1903.10676 (2019)."},{"key":"e_1_3_3_3_2_1","volume-title":"Human cognition: Learning, understanding, and remembering. (No Title)","author":"Bransford John","year":"1979","unstructured":"John Bransford. 1979. Human cognition: Learning, understanding, and remembering. (No Title) (1979)."},{"key":"e_1_3_3_3_3_1","volume-title":"Informal elements in English academic writing: threats or opportunities for advanced non-native speakers? In Writing: Texts, processes and practices","author":"Chang Yu-Ying","unstructured":"Yu-Ying Chang and John\u00a0M Swales. 2014. Informal elements in English academic writing: threats or opportunities for advanced non-native speakers? In Writing: Texts, processes and practices. Routledge, 145\u2013167."},{"key":"e_1_3_3_3_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093592"},{"key":"e_1_3_3_3_5_1","volume-title":"Intentable: A mixed-initiative system for intent-based chart captioning. In 2022 IEEE Visualization and Visual Analytics (VIS)","author":"Choi Jiwon","year":"2022","unstructured":"Jiwon Choi and Jaemin Jo. 2022. Intentable: A mixed-initiative system for intent-based chart captioning. In 2022 IEEE Visualization and Visual Analytics (VIS). IEEE, 40\u201344."},{"key":"e_1_3_3_3_6_1","volume-title":"Computer Graphics Forum, Vol.\u00a038","author":"Choi Jinho","unstructured":"Jinho Choi, Sanghun Jung, Deok\u00a0Gun Park, Jaegul Choo, and Niklas Elmqvist. 2019. Visualizing for the non-visual: Enabling the visually impaired to use visualization. In Computer Graphics Forum, Vol.\u00a038. Wiley Online Library, 249\u2013260."},{"key":"e_1_3_3_3_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2910896.2910904"},{"key":"e_1_3_3_3_8_1","volume-title":"Investigating academic texts with corpus-based techniques: An example from biology. Linguistics and education 8, 3","author":"Conrad M","year":"1996","unstructured":"Susan\u00a0M Conrad. 1996. Investigating academic texts with corpus-based techniques: An example from biology. Linguistics and education 8, 3 (1996), 299\u2013326."},{"key":"e_1_3_3_3_9_1","doi-asserted-by":"publisher","DOI":"10.1080\/09588221.2011.582687"},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2899475.2899478"},{"key":"e_1_3_3_3_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"publisher","DOI":"10.1037\/e577632012-009"},{"key":"e_1_3_3_3_13_1","doi-asserted-by":"publisher","DOI":"10.1006\/jmla.1993.1036"},{"key":"e_1_3_3_3_14_1","volume-title":"Reader versus writer responsibility: A new typology. Landmark essays on ESL writing","author":"Hinds John","year":"1987","unstructured":"John Hinds, U Connor, and RB Kaplan. 1987. Reader versus writer responsibility: A new typology. Landmark essays on ESL writing (1987), 63\u201374."},{"key":"e_1_3_3_3_15_1","volume-title":"Scitune: Aligning large language models with scientific multimodal instructions. arXiv preprint arXiv:2307.01139","author":"Horawalavithana Sameera","year":"2023","unstructured":"Sameera Horawalavithana, Sai Munikoti, Ian Stewart, and Henry Kvinge. 2023. Scitune: Aligning large language models with scientific multimodal instructions. arXiv preprint arXiv:2307.01139 (2023)."},{"key":"e_1_3_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_3_3_17_1","volume-title":"The 1st Scientific Figure Captioning (SciCap) Challenge","author":"Hsu Ting-Yao","unstructured":"Ting-Yao Hsu, Yi-Li Hsu, Shaurya Rohatgi, Ryan Rossi, Sungchul Kim, Ani Nenkova, Lun-Wei Ku, Huijuan Xu, C. Giles, and Ting-Hao Huang. 2023. The 1st Scientific Figure Captioning (SciCap) Challenge. http:\/\/scicap.ai\/."},{"key":"e_1_3_3_3_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.363"},{"key":"e_1_3_3_3_19_1","volume-title":"Eunyee Koh, Clyde\u00a0Lee Giles, and Ting-Hao\u2019Kenneth","author":"Huang Chieh-Yang","year":"2023","unstructured":"Chieh-Yang Huang, Ting-Yao Hsu, Ryan Rossi, Ani Nenkova, Sungchul Kim, Gromit Yeuk-Yin Chan, Eunyee Koh, Clyde\u00a0Lee Giles, and Ting-Hao\u2019Kenneth\u2019 Huang. 2023. Summaries as Captions: Generating Figure Captions for Scientific Documents with Automated Text Summarization. arXiv preprint arXiv:2302.12324 (2023)."},{"key":"e_1_3_3_3_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDARW.2019.00018"},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00592"},{"key":"e_1_3_3_3_22_1","volume-title":"Figureqa: An annotated figure dataset for visual reasoning. arXiv preprint arXiv:1710.07300","author":"Kahou Samira\u00a0Ebrahimi","year":"2017","unstructured":"Samira\u00a0Ebrahimi Kahou, Vincent Michalski, Adam Atkinson, \u00c1kos K\u00e1d\u00e1r, Adam Trischler, and Yoshua Bengio. 2017. Figureqa: An annotated figure dataset for visual reasoning. arXiv preprint arXiv:1710.07300 (2017)."},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_3_3_25_1","volume-title":"ACL-Fig: A Dataset for Scientific Figure Classification. arXiv preprint arXiv:2301.12293","author":"Karishma Zeba","year":"2023","unstructured":"Zeba Karishma, Shaurya Rohatgi, Kavya\u00a0Shrinivas Puranik, Jian Wu, and C\u00a0Lee Giles. 2023. ACL-Fig: A Dataset for Scientific Figure Classification. arXiv preprint arXiv:2301.12293 (2023)."},{"key":"e_1_3_3_3_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445443"},{"key":"e_1_3_3_3_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445443"},{"key":"e_1_3_3_3_28_1","volume-title":"Proceedings of Machine Translation Summit XII: Papers.","author":"Koehn Philipp","year":"2009","unstructured":"Philipp Koehn and Barry Haddow. 2009. Interactive assistance to human translators using statistical machine translation methods. In Proceedings of Machine Translation Summit XII: Papers."},{"key":"e_1_3_3_3_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501999"},{"key":"e_1_3_3_3_30_1","doi-asserted-by":"publisher","DOI":"10.5555\/208107.208109"},{"key":"e_1_3_3_3_31_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Lee Kenton","year":"2023","unstructured":"Kenton Lee, Mandar Joshi, Iulia\u00a0Raluca Turc, Hexiang Hu, Fangyu Liu, Julian\u00a0Martin Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, and Kristina Toutanova. 2023. Pix2struct: Screenshot parsing as pretraining for visual language understanding. In International Conference on Machine Learning. PMLR, 18893\u201318912."},{"key":"e_1_3_3_3_32_1","volume-title":"Scigraphqa: A large-scale synthetic multi-turn question-answering dataset for scientific graphs. arXiv preprint arXiv:2308.03349","author":"Li Shengzhi","year":"2023","unstructured":"Shengzhi Li and Nima Tajbakhsh. 2023. Scigraphqa: A large-scale synthetic multi-turn question-answering dataset for scientific graphs. arXiv preprint arXiv:2308.03349 (2023)."},{"key":"e_1_3_3_3_33_1","volume-title":"Inksight: Leveraging sketch interaction for documenting chart findings in computational notebooks","author":"Lin Yanna","year":"2023","unstructured":"Yanna Lin, Haotian Li, Leni Yang, Aoyu Wu, and Huamin Qu. 2023. Inksight: Leveraging sketch interaction for documenting chart findings in computational notebooks. IEEE Transactions on Visualization and Computer Graphics (2023)."},{"key":"e_1_3_3_3_34_1","volume-title":"Autotitle: An interactive title generator for visualizations","author":"Liu Can","year":"2023","unstructured":"Can Liu, Yuhan Guo, and Xiaoru Yuan. 2023. Autotitle: An interactive title generator for visualizations. IEEE Transactions on Visualization and Computer Graphics (2023)."},{"key":"e_1_3_3_3_35_1","volume-title":"Matcha: Enhancing visual language pretraining with math reasoning and chart derendering. arXiv preprint arXiv:2212.09662","author":"Liu Fangyu","year":"2022","unstructured":"Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, and Julian\u00a0Martin Eisenschlos. 2022. Matcha: Enhancing visual language pretraining with math reasoning and chart derendering. arXiv preprint arXiv:2212.09662 (2022)."},{"key":"e_1_3_3_3_36_1","volume-title":"Linecap: Line charts for data visualization captioning models. In 2022 IEEE Visualization and Visual Analytics (VIS)","author":"Mahinpei Anita","year":"2022","unstructured":"Anita Mahinpei, Zona Kostic, and Chris Tanner. 2022. Linecap: Line charts for data visualization captioning models. In 2022 IEEE Visualization and Visual Analytics (VIS). IEEE, 35\u201339."},{"key":"e_1_3_3_3_37_1","volume-title":"UniChart: A Universal Vision-language Pretrained Model for Chart Comprehension and Reasoning. arXiv preprint arXiv:2305.14761","author":"Masry Ahmed","year":"2023","unstructured":"Ahmed Masry, Parsa Kavehzadeh, Xuan\u00a0Long Do, Enamul Hoque, and Shafiq Joty. 2023. UniChart: A Universal Vision-language Pretrained Model for Chart Comprehension and Reasoning. arXiv preprint arXiv:2305.14761 (2023)."},{"key":"e_1_3_3_3_38_1","volume-title":"ChartQA: A benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244","author":"Masry Ahmed","year":"2022","unstructured":"Ahmed Masry, Do\u00a0Xuan Long, Jia\u00a0Qing Tan, Shafiq Joty, and Enamul Hoque. 2022. ChartQA: A benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244 (2022)."},{"key":"e_1_3_3_3_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3428121"},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"publisher","DOI":"10.1177\/002246698301700214"},{"key":"e_1_3_3_3_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.inlg-1.20"},{"key":"e_1_3_3_3_42_1","unstructured":"OpenAI. 2022. GPT-3.5: Language Models are Few-Shot Learners. https:\/\/platform.openai.com\/docs\/models\/gpt-3-5."},{"key":"e_1_3_3_3_43_1","unstructured":"OpenAI. 2023. GPT-4V(ision) System Card. https:\/\/api.semanticscholar.org\/CorpusID:263218031"},{"key":"e_1_3_3_3_44_1","doi-asserted-by":"publisher","DOI":"10.58680\/rte197420076"},{"key":"e_1_3_3_3_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3449923"},{"key":"e_1_3_3_3_46_1","volume-title":"Proceedings, Part VII 14","author":"Siegel Noah","year":"2016","unstructured":"Noah Siegel, Zachary Horvitz, Roie Levin, Santosh Divvala, and Ali Farhadi. 2016. Figureseer: Parsing result-figures in research papers. In Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part VII 14. Springer, 664\u2013680."},{"key":"e_1_3_3_3_47_1","volume-title":"FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with Human Feedback. arXiv preprint arXiv:2307.10867","author":"Singh Ashish","year":"2023","unstructured":"Ashish Singh, Prateek Agarwal, Zixuan Huang, Arpita Singh, Tong Yu, Sungchul Kim, Victor Bursztyn, Nikos Vlassis, and Ryan\u00a0A Rossi. 2023. FigCaps-HF: A Figure-to-Caption Generative Framework and Benchmark with Human Feedback. arXiv preprint arXiv:2307.10867 (2023)."},{"key":"e_1_3_3_3_48_1","first-page":"1233","article-title":"Striking a balance: reader takeaways and preferences when integrating text and charts","volume":"29","author":"Stokes Chase","year":"2022","unstructured":"Chase Stokes, Vidya Setlur, Bridget Cogley, Arvind Satyanarayan, and Marti\u00a0A Hearst. 2022. Striking a balance: reader takeaways and preferences when integrating text and charts. IEEE Transactions on Visualization and Computer Graphics 29, 1 (2022), 1233\u20131243.","journal-title":"IEEE Transactions on Visualization and Computer Graphics"},{"key":"e_1_3_3_3_49_1","volume-title":"Vistext: A benchmark for semantically rich chart captioning. arXiv preprint arXiv:2307.05356","author":"Tang J","year":"2023","unstructured":"Benny\u00a0J Tang, Angie Boggust, and Arvind Satyanarayan. 2023. Vistext: A benchmark for semantically rich chart captioning. arXiv preprint arXiv:2307.05356 (2023)."},{"key":"e_1_3_3_3_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3563657.3595986"},{"key":"e_1_3_3_3_51_1","volume-title":"A Knowledge Augmented Dataset to Study the Challenges of Scientific Figure Captioning. arXiv preprint arXiv:2306.03491","author":"Yang Zhishen","year":"2023","unstructured":"Zhishen Yang, Raj Dabre, Hideki Tanaka, and Naoaki Okazaki. 2023. SciCap+: A Knowledge Augmented Dataset to Study the Challenges of Scientific Figure Captioning. arXiv preprint arXiv:2306.03491 (2023)."},{"key":"e_1_3_3_3_52_1","volume-title":"mplug-docowl: Modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499","author":"Ye Jiabo","year":"2023","unstructured":"Jiabo Ye, Anwen Hu, Haiyang Xu, Qinghao Ye, Ming Yan, Yuhao Dan, Chenlin Zhao, Guohai Xu, Chenliang Li, Junfeng Tian, 2023. mplug-docowl: Modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499 (2023)."}],"event":{"name":"CHI '24: CHI Conference on Human Factors in Computing Systems","location":"Honolulu HI USA","acronym":"CHI '24","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGACCESS ACM Special Interest Group on Accessible Computing"]},"container-title":["Extended Abstracts of the CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613905.3650738","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3613905.3650738","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:44:16Z","timestamp":1750290256000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613905.3650738"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,2]]},"references-count":52,"alternative-id":["10.1145\/3613905.3650738","10.1145\/3613905"],"URL":"https:\/\/doi.org\/10.1145\/3613905.3650738","relation":{},"subject":[],"published":{"date-parts":[[2024,5,2]]},"assertion":[{"value":"2024-05-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}