{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T20:11:26Z","timestamp":1780344686853,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,7,18]],"date-time":"2023-07-18T00:00:00Z","timestamp":1689638400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,7,19]]},"DOI":"10.1145\/3539618.3591875","type":"proceedings-article","created":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T00:22:23Z","timestamp":1689726143000},"page":"2691-2700","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["BizGraphQA: A Dataset for Image-based Inference over Graph-structured Diagrams from Business Domains"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2737-9820","authenticated-orcid":false,"given":"Petr","family":"Babkin","sequence":"first","affiliation":[{"name":"J.P. Morgan AI Research, Palo Alto, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5516-262X","authenticated-orcid":false,"given":"William","family":"Watson","sequence":"additional","affiliation":[{"name":"J.P. Morgan AI Research, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7460-5654","authenticated-orcid":false,"given":"Zhiqiang","family":"Ma","sequence":"additional","affiliation":[{"name":"J.P. Morgan AI Research, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4501-3710","authenticated-orcid":false,"given":"Lucas","family":"Cecchi","sequence":"additional","affiliation":[{"name":"J.P. Morgan AI Research, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8866-1482","authenticated-orcid":false,"given":"Natraj","family":"Raman","sequence":"additional","affiliation":[{"name":"J.P. Morgan AI Research, London, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1908-8679","authenticated-orcid":false,"given":"Armineh","family":"Nourbakhsh","sequence":"additional","affiliation":[{"name":"J.P. Morgan AI Research, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5960-5811","authenticated-orcid":false,"given":"Sameena","family":"Shah","sequence":"additional","affiliation":[{"name":"J.P. Morgan AI Research, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,7,18]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Visual FUDGE: Form Understanding via Dynamic Graph Editing. arXiv preprint arXiv:2105.08194","author":"Davis Brian","year":"2021","unstructured":"Brian Davis, Bryan Morse, Brian Price, Chris Tensmeyer, and Curtis Wiginton. 2021. Visual FUDGE: Form Understanding via Dynamic Graph Editing. arXiv preprint arXiv:2105.08194 (2021)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco_a_01273"},{"key":"e_1_3_2_1_3_1","volume-title":"Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Goyal Yash","year":"2017","unstructured":"Yash Goyal, Tejas Khot, Douglas Summers-Stay, Dhruv Batra, and Devi Parikh. 2017. Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_4_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. arXiv:2006.11239 [cs.LG]"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019"},{"key":"e_1_3_2_1_6_1","volume-title":"Manning","author":"Hudson Drew A.","year":"2019","unstructured":"Drew A. Hudson and Christopher D. Manning. 2019. Learning by Abstraction: The Neural State Machine.. In NeurIPS, Hanna M. Wallach, Hugo Larochelle, Alina Beygelzimer, Florence d'Alch\u00e9 Buc, Emily B. Fox, and Roman Garnett (Eds.). 5901--5914. http:\/\/dblp.uni-trier.de\/db\/conf\/nips\/nips2019.html#HudsonM19"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Johnson Justin","unstructured":"Justin Johnson, Bharath Hariharan, Laurens van der Maaten, Li Fei-Fei, C. Lawrence Zitnick, and Ross Girshick. 2017. CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"e_1_3_2_1_8_1","volume-title":"DVQA: Understanding Data Visualizations via Question Answering. In CVPR.","author":"Kafle Kushal","year":"2018","unstructured":"Kushal Kafle, Scott Cohen, Brian Price, and Christopher Kanan. 2018. DVQA: Understanding Data Visualizations via Question Answering. In CVPR."},{"key":"e_1_3_2_1_9_1","volume-title":"FigureQA: An Annotated Figure Dataset for Visual Reasoning. ArXiv abs\/1710.07300","author":"Kahou Samira Ebrahimi","year":"2017","unstructured":"Samira Ebrahimi Kahou, Adam Atkinson, Vincent Michalski, \u00c1kos K\u00e1d\u00e1r, Adam Trischler, and Yoshua Bengio. 2017. FigureQA: An Annotated Figure Dataset for Visual Reasoning. ArXiv abs\/1710.07300 (2017)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"e_1_3_2_1_11_1","unstructured":"Geewook Kim Teakgyu Hong Moonbin Yim Jeongyeon Nam Jinyoung Park Jinyeong Yim Wonseok Hwang Sangdoo Yun Dongyoon Han and Seunghyun Park. 2022. OCR-free Document Understanding Transformer. arXiv:2111.15664 [cs.LG]"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 5583--5594. https:\/\/proceedings.mlr.press\/v139\/kim21k.html"},{"key":"e_1_3_2_1_13_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"e_1_3_2_1_14_1","volume-title":"Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. https:\/\/arxiv.org\/abs\/1602.07332","author":"Krishna Ranjay","year":"2016","unstructured":"Ranjay Krishna, Yuke Zhu, Oliver Groth, Justin Johnson, Kenji Hata, Joshua Kravitz, Stephanie Chen, Yannis Kalantidis, Li-Jia Li, David A Shamma, Michael Bernstein, and Li Fei-Fei. 2016. Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. https:\/\/arxiv.org\/abs\/1602.07332"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","unstructured":"Chen-Yu Lee Chun-Liang Li Chu Wang Renshen Wang Yasuhisa Fujii Siyang Qin Ashok Popat and Tomas Pfister. 2021. ROPE: Reading Order Equivariant Positional Encoding for Graph-based Document Information Extraction. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers). Association for Computational Linguistics Online 314--321. https:\/\/doi.org\/10.18653\/v1\/2021.acl-short.41","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_16_1","volume-title":"Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In Computer Vision - ECCV","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, Yejin Choi, and Jianfeng Gao. 2020. Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In Computer Vision - ECCV 2020, Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm (Eds.). Springer International Publishing, Cham, 121--137."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_18_1","volume-title":"IconQA: A New Benchmark for Abstract Diagram Understanding and Visual Language Reasoning. In The 35th Conference on Neural Information Processing Systems (NeurIPS) Track on Datasets and Benchmarks.","author":"Lu Pan","year":"2021","unstructured":"Pan Lu, Liang Qiu, Jiaqi Chen, Tony Xia, Yizhou Zhao, Wei Zhang, Zhou Yu, Xiaodan Liang, and Song-Chun Zhu. 2021. IconQA: A New Benchmark for Abstract Diagram Understanding and Visual Language Reasoning. In The 35th Conference on Neural Information Processing Systems (NeurIPS) Track on Datasets and Benchmarks."},{"key":"e_1_3_2_1_19_1","volume-title":"InfographicVQA. In 2022 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). 2582--2591","author":"Mathew Minesh","year":"2022","unstructured":"Minesh Mathew, Viraj Bagal, Rub\u00e8n Tito, Dimosthenis Karatzas, Ernest Valveny, and C. V. Jawahar. 2022. InfographicVQA. In 2022 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). 2582--2591. https:\/\/doi.org\/10.1109\/ WACV51458.2022.00264"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"e_1_3_2_1_21_1","unstructured":"Mehdi Mirza and Simon Osindero. 2014. Conditional Generative Adversarial Nets. arXiv:1411.1784 [cs.LG]"},{"key":"e_1_3_2_1_23_1","volume-title":"4th International Conference on Learning Representations, ICLR","author":"Radford Alec","year":"2016","unstructured":"Alec Radford, Luke Metz, and Soumith Chintala. 2016. Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks. In 4th International Conference on Learning Representations, ICLR 2016, San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1511.06434"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108660"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). Association for Computational Linguistics Hong Kong China 5100--5111. https:\/\/doi.org\/10. 18653\/v1\/D19-1514","DOI":"10.18653\/v1\/D19-1514"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of DI 2022: The 3rd Workshop on Document Intelligence. KDD","author":"Tanner Simon","year":"2022","unstructured":"Simon Tanner, Marcelo Feighelstein, Jasmina Bogojeska, Joseph Shtok, Assef Arbelle, Peter Staar, Anika Schumann, Jonas Kuhn, and Leonid Karlinsky. 2022. FlowchartQA: The First Large-Scale Benchmark for Reasoning over Flowcharts. In Proceedings of DI 2022: The 3rd Workshop on Document Intelligence. KDD, Washington, DC."},{"key":"e_1_3_2_1_28_1","volume-title":"Shrdlu: A system for dialog.","author":"Terry Winograd","year":"1972","unstructured":"Terry Winograd et al. 1972. Shrdlu: A system for dialog."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403172"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"}],"event":{"name":"SIGIR '23: The 46th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Taipei Taiwan","acronym":"SIGIR '23","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3591875","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3539618.3591875","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:38:07Z","timestamp":1750178287000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3591875"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,18]]},"references-count":29,"alternative-id":["10.1145\/3539618.3591875","10.1145\/3539618"],"URL":"https:\/\/doi.org\/10.1145\/3539618.3591875","relation":{},"subject":[],"published":{"date-parts":[[2023,7,18]]},"assertion":[{"value":"2023-07-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}