{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T11:24:35Z","timestamp":1763810675168,"version":"3.44.0"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030863364"},{"type":"electronic","value":"9783030863371"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-86337-1_42","type":"book-chapter","created":{"date-parts":[[2021,9,3]],"date-time":"2021-09-03T20:48:12Z","timestamp":1630702092000},"page":"635-649","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":22,"title":["ICDAR 2021 Competition on Document Visual Question Answering"],"prefix":"10.1007","author":[{"given":"Rub\u00e8n","family":"Tito","sequence":"first","affiliation":[]},{"given":"Minesh","family":"Mathew","sequence":"additional","affiliation":[]},{"given":"C. 
V.","family":"Jawahar","sequence":"additional","affiliation":[]},{"given":"Ernest","family":"Valveny","sequence":"additional","affiliation":[]},{"given":"Dimosthenis","family":"Karatzas","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,2]]},"reference":[{"key":"42_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, A., et al.: VQA: Visual Question Answering (2016)","DOI":"10.1007\/s11263-016-0966-6"},{"key":"42_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering (2017)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"42_CR3","doi-asserted-by":"crossref","unstructured":"Biten, A.F., et al.: ICDAR 2019 competition on scene text visual question answering. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1563\u20131570. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00251"},{"key":"42_CR4","doi-asserted-by":"crossref","unstructured":"Biten, A.F., et al.: Scene text visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4291\u20134301 (2019)","DOI":"10.1109\/ICCV.2019.00439"},{"key":"42_CR5","doi-asserted-by":"crossref","unstructured":"Chaudhry, R., Shekhar, S., Gupta, U., Maneriker, P., Bansal, P., Joshi, A.: Leaf-QA: locate, encode attend for figure question answering. In: WACV (2020)","DOI":"10.1109\/WACV45572.2020.9093269"},{"key":"42_CR6","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: ACL (2019)"},{"key":"42_CR7","unstructured":"Dua, D., Wang, Y., Dasigi, P., Stanovsky, G., Singh, S., Gardner, M.: DROP: a reading comprehension benchmark requiring discrete reasoning over paragraphs. 
In: NAACL-HLT (2019)"},{"key":"42_CR8","unstructured":"Hong, T., Kim, D., Ji, M., Hwang, W., Nam, D., Park, S.: Bros: a pre-trained language model for understanding texts in document (2021)"},{"key":"42_CR9","doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A., Darrell, T., Rohrbach, M.: Iterative answer prediction with pointer-augmented multimodal transformers for TextVQA. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"42_CR10","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for compositional question answering over real-world images. CoRR abs\/1902.09506 (2019). http:\/\/arxiv.org\/abs\/1902.09506"},{"key":"42_CR11","unstructured":"Jain, T., Lennan, C., John, Z., Tran, D.: Imagededup (2019). https:\/\/github.com\/idealo\/imagededup"},{"key":"42_CR12","doi-asserted-by":"crossref","unstructured":"Joshi, M., Choi, E., Weld, D., Zettlemoyer, L.: TriviaQA: a large scale distantly supervised challenge dataset for reading comprehension. In: ACL (2017)","DOI":"10.18653\/v1\/P17-1147"},{"key":"42_CR13","doi-asserted-by":"crossref","unstructured":"Kafle, K., Price, B., Cohen, S., Kanan, C.: DVQA: understanding data visualizations via question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00592"},{"key":"42_CR14","unstructured":"Kahou, S.E., Michalski, V., Atkinson, A., K\u00e1d\u00e1r, \u00c1., Trischler, A., Bengio, Y.: FigureQA: an annotated figure dataset for visual reasoning. arXiv preprint arXiv:1710.07300 (2017)"},{"key":"42_CR15","doi-asserted-by":"crossref","unstructured":"Kembhavi, A., Seo, M., Schwenk, D., Choi, J., Farhadi, A., Hajishirzi, H.: Are you smarter than a sixth grader? Textbook question answering for multimodal machine comprehension. 
In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.571"},{"key":"42_CR16","doi-asserted-by":"crossref","unstructured":"Kwiatkowski, T., et al.: Natural questions: a benchmark for question answering research. Transactions of the Association of Computational Linguistics (2019)","DOI":"10.1162\/tacl_a_00276"},{"key":"42_CR17","doi-asserted-by":"crossref","unstructured":"Lewis, D., Agam, G., Argamon, S., Frieder, O., Grossman, D., Heard, J.: Building a test collection for complex document information processing. In: Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 665\u2013666 (2006)","DOI":"10.1145\/1148170.1148307"},{"key":"42_CR18","unstructured":"Madan, S., et al.: Synthetically trained icon proposals for parsing and summarizing infographics. arXiv preprint arXiv:1807.10441 (2018)"},{"key":"42_CR19","doi-asserted-by":"crossref","unstructured":"Mathew, M., Bagal, V., Tito, R.P., Karatzas, D., Valveny, E., Jawahar, C.: InfographicVQA. arXiv preprint arXiv:2104.12756 (2021)","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"42_CR20","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.V.: DocVQA: a dataset for VQA on document images. In: WACV (2020)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"42_CR21","unstructured":"Mathew, M., Tito, R., Karatzas, D., Manmatha, R., Jawahar, C.: Document visual question answering challenge 2020. arXiv preprint arXiv:2008.08899 (2020)"},{"key":"42_CR22","unstructured":"Nguyen, T., et al.: MS MARCO: a human generated machine reading comprehension dataset. CoRR abs\/1611.09268 (2016)"},{"key":"42_CR23","doi-asserted-by":"crossref","unstructured":"Pasupat, P., Liang, P.: Compositional semantic parsing on semi-structured tables. In: Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 
1470\u20131480 (2015)","DOI":"10.3115\/v1\/P15-1142"},{"key":"42_CR24","doi-asserted-by":"crossref","unstructured":"Powalski, R., Borchmann, \u0141., Jurkiewicz, D., Dwojak, T., Pietruszka, M., Pa\u0142ka, G.: Going full-tilt boogie on document understanding with text-image-layout transformer. arXiv preprint arXiv:2102.09550 (2021)","DOI":"10.1007\/978-3-030-86331-9_47"},{"key":"42_CR25","doi-asserted-by":"crossref","unstructured":"Rajpurkar, P., Zhang, J., Lopyrev, K., Liang, P.: Squad: 100,000+ questions for machine comprehension of text. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing, pp. 2383\u20132392 (2016)","DOI":"10.18653\/v1\/D16-1264"},{"key":"42_CR26","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: Proceedings of the IEEE\/CVF CVPR, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"42_CR27","doi-asserted-by":"crossref","unstructured":"Teney, D., Anderson, P., He, X., van den Hengel, A.: Tips and tricks for visual question answering: learnings from the 2017 challenge (2017)","DOI":"10.1109\/CVPR.2018.00444"},{"key":"42_CR28","doi-asserted-by":"crossref","unstructured":"Tito, R., Karatzas, D., Valveny, E.: Document collection visual question answering. arXiv preprint arXiv:2104.14336 (2021)","DOI":"10.1007\/978-3-030-86331-9_50"},{"key":"42_CR29","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, pp. 6000\u20136010 (2017)"},{"key":"42_CR30","unstructured":"Veit, A., Matera, T., Neumann, L., Matas, J., Belongie, S.: Coco-text: dataset and benchmark for text detection and recognition in natural images (2016)"},{"key":"42_CR31","unstructured":"Wang, W., et al.: StructBERT: incorporating language structures into pre-training for deep language understanding. 
arXiv preprint arXiv:1908.04577 (2019)"},{"key":"42_CR32","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"42_CR33","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"42_CR34","doi-asserted-by":"crossref","unstructured":"Yagcioglu, S., Erdem, A., Erdem, E., Ikizler-Cinbis, N.: RecipeQA: a challenge dataset for multimodal comprehension of cooking recipes. In: EMNLP (2018)","DOI":"10.18653\/v1\/D18-1166"},{"key":"42_CR35","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R.R., Le, Q.V.: XLNet: generalized autoregressive pretraining for language understanding. In: NeurIPS (2019)"},{"key":"42_CR36","doi-asserted-by":"crossref","unstructured":"Zhu, Q., Gao, C., Wang, P., Wu, Q.: Simple is not easy: a simple strong baseline for TextVQA and TextCaps. 
arXiv preprint arXiv:2012.05153 (2020)","DOI":"10.1609\/aaai.v35i4.16476"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2021"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-86337-1_42","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T22:09:00Z","timestamp":1756850940000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-86337-1_42"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030863364","9783030863371"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-86337-1_42","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"2 September 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lausanne","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Switzerland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 September 
2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 September 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iapr.org\/icdar2021","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"340","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"182","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"54% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number 
of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.9","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.9","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Additionally, 13 competition reports are included.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}