{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T22:40:11Z","timestamp":1756852811798,"version":"3.44.0"},"publisher-location":"Cham","reference-count":21,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032043535","type":"print"},{"value":"9783032043542","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T00:00:00Z","timestamp":1756857600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T00:00:00Z","timestamp":1756857600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04354-2_3","type":"book-chapter","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T22:08:56Z","timestamp":1756850936000},"page":"34-41","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Spatially Grounded Explanations in\u00a0VisionLanguage Models for\u00a0Document Visual Question Answering"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-3687-1924","authenticated-orcid":false,"given":"Maximiliano","family":"Hormaz\u00e1bal","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2813-2462","authenticated-orcid":false,"given":"H\u00e9ctor","family":"Cerezo-Costas","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8762-4454","authenticated-orcid":false,"given":"Dimosthenis","family":"Karatzas","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,3]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Jasani, B., Kota, B.U., Xie, Y., Manmatha, R.: DocFormer: end-to-end transformer for document understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 993\u20131003 (2021)","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"3_CR2","doi-asserted-by":"publisher","unstructured":"Biten, A., et al.: Scene text visual question answering, pp. 4290\u20134300. Proceedings of the IEEE International Conference on Computer Vision, Institute of Electrical and Electronics Engineers Inc., United States (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00439, funding Information: This work has been supported by projects TIN2017-89779-P, Marie-Curie (712949 TECNIOspring PLUS), aBSINTHE (Fundacion BBVA 2017), the CERCA Programme \/ Generalitat de Catalunya, a European Social Fund grant (CCI: 2014ES05SFOP007), NVIDIA Corporation and PhD scholarships from AGAUR (2019-FIB01233) and the UAB. Publisher Copyright: 2019 IEEE","DOI":"10.1109\/ICCV.2019.00439"},{"key":"3_CR3","doi-asserted-by":"publisher","unstructured":"G\u00f3mez, L., et al.: Multimodal grid features and cell pointers for scene text visual question answering. Pattern Recogn. Lett. 150, 242\u2013249 (2021). https:\/\/doi.org\/10.1016\/j.patrec.2021.06.026, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167865521002336","DOI":"10.1016\/j.patrec.2021.06.026"},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Hu, A., et al.: mPLUG-DocOwl2: High-resolution compressing for OCR-free multi-page document understanding (2024). https:\/\/arxiv.org\/abs\/2409.03420","DOI":"10.18653\/v1\/2025.acl-long.291"},{"key":"3_CR5","doi-asserted-by":"publisher","unstructured":"Hu, R., Singh, A., Darrell, T., Rohrbach, M.: Iterative answer prediction with pointer-augmented multimodal transformers for TextVQA . In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9989\u20139999. IEEE Computer Society, Los Alamitos, CA, USA (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.01001","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"3_CR6","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 4904\u20134916. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/jia21b.html"},{"key":"3_CR7","doi-asserted-by":"crossref","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: European Conference on Computer Vision, pp. 498\u2013517. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"3_CR8","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Krause, A., Brunskill, E., Cho, K., Engelhardt, B., Sabato, S., Scarlett, J. (eds.) Proceedings of the 40th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0202, pp. 19730\u201319742. PMLR (2023). https:\/\/proceedings.mlr.press\/v202\/li23q.html"},{"key":"3_CR9","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Chaudhuri, K., Jegelka, S., Song, L., Szepesvari, C., Niu, G., Sabato, S. (eds.) Proceedings of the 39th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0162, pp. 12888\u201312900. PMLR (2022). https:\/\/proceedings.mlr.press\/v162\/li22n.html"},{"key":"3_CR10","unstructured":"Li, K., Vosselman, G., Yang, M.Y.: Convincing rationales for visual question answering reasoning (2025). https:\/\/arxiv.org\/abs\/2402.03896"},{"key":"3_CR11","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems. vol.\u00a036, pp. 34892\u201334916. Curran Associates, Inc. (2023). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/6dcf277ea32ce3288914faf369fe6de0-Paper-Conference.pdf"},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: DocVQA: a dataset for VQA on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"3_CR13","unstructured":"Mohammadshirazi, A., Neogi, P.P.G., Lim, S.N., Ramnath, R.: DLaVA: Document language and vision assistant for answer localization with enhanced interpretability and trustworthiness (2024). https:\/\/arxiv.org\/abs\/2412.00151"},{"key":"3_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"732","DOI":"10.1007\/978-3-030-86331-9_47","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"R Powalski","year":"2021","unstructured":"Powalski, R., Borchmann, \u0141, Jurkiewicz, D., Dwojak, T., Pietruszka, M., Pa\u0142ka, G.: Going Full-TILT boogie on document understanding with text-image-layout transformer. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12822, pp. 732\u2013747. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86331-9_47"},{"key":"3_CR15","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"3_CR16","doi-asserted-by":"publisher","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression . In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 658\u2013666. IEEE Computer Society, Los Alamitos, CA, USA (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00075","DOI":"10.1109\/CVPR.2019.00075"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Saifullah, S., Agne, S., Dengel, A., Ahmed, S.: DocXplain: a novel model-agnostic explainability method for document image classification. In: International Conference on Document Analysis and Recognition, pp. 103\u2013123. Springer (2024)","DOI":"10.1007\/978-3-031-70546-5_7"},{"key":"3_CR18","doi-asserted-by":"publisher","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-CAM: visual explanations from deep networks via gradient-based localization. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 618\u2013626 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.74","DOI":"10.1109\/ICCV.2017.74"},{"key":"3_CR19","doi-asserted-by":"publisher","unstructured":"Singh, A., et al.: FLAVA: a foundational language and vision alignment model. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15617\u201315629 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01519","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"3_CR20","doi-asserted-by":"publisher","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 1192\u20131200. KDD \u201920, ACM (2020). https:\/\/doi.org\/10.1145\/3394486.3403172","DOI":"10.1145\/3394486.3403172"},{"key":"3_CR21","unstructured":"Zhou, Y., et al.: Large language models are human-level prompt engineers. In: The Eleventh International Conference on Learning Representations (2022)"}],"container-title":["Lecture Notes in Computer Science","Experimental IR Meets Multilinguality, Multimodality, and Interaction"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04354-2_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T22:09:13Z","timestamp":1756850953000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04354-2_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,3]]},"ISBN":["9783032043535","9783032043542"],"references-count":21,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04354-2_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,3]]},"assertion":[{"value":"3 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CLEF","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference of the Cross-Language Evaluation Forum for European Languages","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Madrid","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Spain","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"clef2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/clef2025.clef-initiative.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}