{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:27:21Z","timestamp":1763922441041,"version":"3.45.0"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032093677","type":"print"},{"value":"9783032093684","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-09368-4_12","type":"book-chapter","created":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:14:05Z","timestamp":1763921645000},"page":"192-208","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Visual Document Matching for\u00a0Zero-Shot Document Classification"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6952-4731","authenticated-orcid":false,"given":"Lucas","family":"De Almeida Bandeira Macedo","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7379-0961","authenticated-orcid":false,"given":"Joao Paulo","family":"Vieira Costa","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3367-8033","authenticated-orcid":false,"given":"Joao Pedro","family":"Felix De Almeida","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0866-658X","authenticated-orcid":false,"given":"Pedro","family":"Garcia Freitas","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1826-1850","authenticated-orcid":false,"given":"Li","family":"Weigang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,24]]},"reference":[{"key":"12_CR1","doi-asserted-by":"publisher","first-page":"223","DOI":"10.1016\/j.neucom.2021.04.114","volume":"453","author":"L Liu","year":"2021","unstructured":"Liu, L., Wang, Z., Qiu, T., Chen, Q., Yue, L., Suen, C.Y.: Document image classification: progress over two decades. Neurocomputing 453, 223\u2013240 (2021)","journal-title":"Neurocomputing"},{"issue":"9","key":"12_CR2","doi-asserted-by":"publisher","first-page":"2251","DOI":"10.1109\/TPAMI.2018.2857768","volume":"41","author":"Y Xian","year":"2019","unstructured":"Xian, Y., Lampert, C.H., Schiele, B., Akata, Z.: Zero-shot learning - a comprehensive evaluation of the good, the bad and the ugly. IEEE Trans. Pattern Anal. Mach. Intell. 41(9), 2251\u20132265 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"12_CR3","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event. Proceedings of Machine Learning Research, vol. 139, pp. 8748\u20138763. PMLR (2021)"},{"key":"12_CR4","doi-asserted-by":"crossref","unstructured":"Larson, S., Lim, G., Leach, K.: On evaluation of document classification with RVL-CDIP. In: Vlachos, A., Augenstein, I. (eds.) Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, Dubrovnik, Croatia, pp. 2665\u20132678. Association for Computational Linguistics (2023)","DOI":"10.18653\/v1\/2023.eacl-main.195"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Jha, A., Samavedhi, A., Rakesh, V., Chandrashekar, J., Reddy, C.K.: Transformer-based models for long-form document matching: challenges and empirical analysis. In: Vlachos, A., Augenstein, I. (eds.) Findings of the Association for Computational Linguistics: EACL 2023, Dubrovnik, Croatia, May 2-6, 2023, pp. 2300\u20132310. Association for Computational Linguistics (2023)","DOI":"10.18653\/v1\/2023.findings-eacl.178"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Lewis, D.D., Agam, G., Argamon, S., Frieder, O., Grossman, D.A., Heard, J.: Building a test collection for complex document information processing. In: Efthimiadis, E.N., Dumais, S.T., Hawking, D., J\u00e4rvelin, K. (eds.) SIGIR 2006: Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, Seattle, Washington, USA, August 6-11, 2006, pp. 665\u2013666. ACM (2006)","DOI":"10.1145\/1148170.1148307"},{"key":"12_CR7","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 13th International Conference on Document Analysis and Recognition, ICDAR 2015, Nancy, France, August 23\u201326, 2015, pp. 991\u2013995. IEEE Computer Society (2015)","DOI":"10.1109\/ICDAR.2015.7333910"},{"key":"12_CR8","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.V.: DocVQA: a dataset for VQA on document images. In: IEEE Winter Conference on Applications of Computer Vision, WACV 2021, Waikoloa, HI, USA, January 3\u20138, 2021, pp. 2199\u20132208. IEEE (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Jaume, G., Ekenel, G.K., Thiran, J.-P.: FUNSD: a dataset for form understanding in noisy scanned documents. In: 2019 International Conference on Document Analysis and Recognition Workshops (ICDARW), pp. 1\u20136 (2019)","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: ICDAR2019 competition on scanned receipt OCR and information extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1516\u20131520. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"12_CR11","unstructured":"Park, S., Shin, S., Kim, B., Cha, J., Lee, H.: CORD: a consolidated receipt dataset for post-OCR parsing. In: Document Intelligence Workshop at NeurIPS 2019 (2019)"},{"key":"12_CR12","unstructured":"Xu, Y., et al.: LayoutXLM: multimodal pre-training for multilingual visually-rich document understanding. CoRR, abs\/2104.08836 (2021)"},{"key":"12_CR13","doi-asserted-by":"crossref","unstructured":"Fujinuma, Y., Varia, S., Sankaran, N., Appalaraju, S., Min, B., Vyas, Y.: A multi-modal multilingual benchmark for document image classification. In: Bouamor, H., Pino, J., Bali, K., (eds.) Findings of the Association for Computational Linguistics: EMNLP 2023, Singapore, December 6\u201310, 2023, pp. 14361\u201314376. Association for Computational Linguistics (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.958"},{"key":"12_CR14","doi-asserted-by":"publisher","unstructured":"Youssef, A., Valvano, G., Veneri, G.: Document layout analysis with variational autoencoders: an industrial application. In: Ceci, M., Flesca, S., Masciari, E., Manco, G., Ras, Z.W. (eds.) ISMIS 2022. LNCS, vol. 13515, pp. 477\u2013486. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-16564-1_46","DOI":"10.1007\/978-3-031-16564-1_46"},{"key":"12_CR15","doi-asserted-by":"publisher","unstructured":"Sinha, S., Khan, M.S.U., Sheikh, T.U., Stricker, D., Afzal, M.Z.: CICA: content-injected contrastive alignment for zero-shot document image classification. In: Smith, E.H.B., Liwicki, M., Peng, L. (eds.) ICDAR 2024, Part IV. LNCS, vol. 14807, pp. 124\u2013141. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-70546-5_8","DOI":"10.1007\/978-3-031-70546-5_8"},{"key":"12_CR16","doi-asserted-by":"publisher","unstructured":"Landeghem, J.V., et al.: DistilDoc: knowledge distillation for visually-rich document applications. In: Smith, E.H.B., Liwicki, M., Peng, L. (eds.) ICDAR 2024, Part IV. LNCS, vol. 14807, pp. 195\u2013217. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-70546-5_12","DOI":"10.1007\/978-3-031-70546-5_12"},{"key":"12_CR17","doi-asserted-by":"publisher","unstructured":"Scius-Bertrand, A., Jungo, M., V\u00f6gtlin, L., Spat, J.-M., Fischer, A.: Zero-shot prompting and few-shot fine-tuning: revisiting document image classification using large language models. In: Antonacopoulos, A., Chaudhuri, S., Chellappa, R., Liu, C.-L., Bhattacharya, S., Pal, U. (eds.) ICPR 2024, Part XIX. LNCS, vol. 15319, pp. 152\u2013166. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-78495-8_10","DOI":"10.1007\/978-3-031-78495-8_10"},{"key":"12_CR18","unstructured":"Meta AI. Llama 3.2: From cloud to edge, now with vision (2024). https:\/\/ai.meta.com\/blog\/llama-3-2-connect-2024-vision-edge-mobile-devices\/"},{"key":"12_CR19","unstructured":"Chen, Z., et al.: Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling. CoRR, abs\/2412.05271 (2024)"},{"key":"12_CR20","unstructured":"OpenAI. Hello gpt-4o (2024). https:\/\/openai.com\/index\/hello-gpt-4o\/"},{"key":"12_CR21","unstructured":"OpenAI. Gpt-4o mini: advancing cost-efficient intelligence (2024). https:\/\/openai.com\/index\/gpt-4o-mini-advancing-cost-efficient-intelligence\/"},{"key":"12_CR22","unstructured":"Bai, S., et\u00a0al. Qwen2.5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Ward Jr., J.H.: Hierarchical grouping to optimize an objective function. J. Am. Stat. Assoc. 58(301), 236\u2013244 (1963)","DOI":"10.1080\/01621459.1963.10500845"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Xian, Y., Lorenz, T., Schiele, B., Akata, Z.: Feature generating networks for zero-shot learning. In: 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018, Salt Lake City, UT, USA, June 18\u201322, 2018, pp. 5542\u20135551. Computer Vision Foundation\/IEEE Computer Society (2018)","DOI":"10.1109\/CVPR.2018.00581"},{"issue":"8","key":"12_CR25","doi-asserted-by":"publisher","first-page":"1586","DOI":"10.1109\/TKDE.2019.2912815","volume":"32","author":"T-T Wong","year":"2020","unstructured":"Wong, T.-T., Yeh, P.-Y.: Reliable accuracy estimates from k-fold cross validation. IEEE Trans. Knowl. Data Eng. 32(8), 1586\u20131594 (2020)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"12_CR26","unstructured":"Google Research. Google colaboratory (2024). https:\/\/colab.research.google.com\/"},{"key":"12_CR27","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27-30, 2016, pp. 770\u2013778. IEEE Computer Society (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Howard, A., et al.: Searching for mobilenetv3. In: 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019, Seoul, Korea (South), October 27 - November 2, 2019, pp. 1314\u20131324. IEEE (2019)","DOI":"10.1109\/ICCV.2019.00140"},{"key":"12_CR29","unstructured":"Tan, M., Le, Q.V.: Efficientnet: rethinking model scaling for convolutional neural networks. In: Chaudhuri, K., Salakhutdinov, R. (eds.) Proceedings of the 36th International Conference on Machine Learning, ICML 2019, 9-15 June 2019, Long Beach, California, USA, Proceedings of Machine Learning Research, vol.\u00a097, pp. 6105\u20136114. PMLR (2019)"},{"key":"12_CR30","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: Bengio, Y., LeCun, Y. (eds.) 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7\u20139, 2015, Conference Track Proceedings (2015)"},{"key":"12_CR31","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3\u20137, 2021. OpenReview.net (2021)"},{"key":"12_CR32","doi-asserted-by":"crossref","unstructured":"Chopra, S., Hadsell, R., LeCun, Y.: Learning a similarity metric discriminatively, with application to face verification. In: 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR 2005), 20\u201326 June 2005, San Diego, CA, USA, pp. 539\u2013546. IEEE Computer Society (2005)","DOI":"10.1109\/CVPR.2005.202"},{"key":"12_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"723","DOI":"10.1007\/978-3-030-86337-1_48","volume-title":"Document Analysis and Recognition - ICDAR 2021","author":"R Tolosana","year":"2021","unstructured":"Tolosana, R., et al.: ICDAR 2021 competition on on-line signature verification. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12824, pp. 723\u2013737. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86337-1_48"},{"key":"12_CR34","doi-asserted-by":"crossref","unstructured":"Hofbauer, H., Uhl, A.: Calculating a boundary for the significance from the equal-error rate. In: 2016 International Conference on Biometrics (ICB), pp. 1\u20134 (2016)","DOI":"10.1109\/ICB.2016.7550053"},{"key":"12_CR35","doi-asserted-by":"crossref","unstructured":"Agrawal, P., Kapoor, R., Agrawal, S.: A hybrid partial fingerprint matching algorithm for estimation of equal error rate. In: 2014 IEEE International Conference on Advanced Communications, Control and Computing Technologies, pp. 1295\u20131299 (2014)","DOI":"10.1109\/ICACCCT.2014.7019308"},{"key":"12_CR36","doi-asserted-by":"crossref","unstructured":"Robbins, H., Monro, S.: A stochastic approximation method. Ann. Math. Stat., 400\u2013407 (1951)","DOI":"10.1214\/aoms\/1177729586"},{"key":"12_CR37","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization (2015)"},{"key":"12_CR38","unstructured":"Liu, Z.: Super convergence cosine annealing with warm-up learning rate. In: CAIBDA 2022, 2nd International Conference on Artificial Intelligence, Big Data and Algorithms, Nanjing, China, 17-19 June 2022, pp. 1\u20137. VDE\/IEEE (2022)"},{"key":"12_CR39","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR 2009), 20-25 June 2009, Miami, Florida, USA, pp. 248\u2013255. IEEE Computer Society (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"issue":"86","key":"12_CR40","first-page":"2579","volume":"9","author":"L van der Maaten","year":"2008","unstructured":"van der Maaten, L., Hinton, G.: Visualizing Data using t-SNE. J. Mach. Learn. Res. 9(86), 2579\u20132605 (2008)","journal-title":"J. Mach. Learn. Res."}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-09368-4_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:14:11Z","timestamp":1763921651000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-09368-4_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,24]]},"ISBN":["9783032093677","9783032093684"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-09368-4_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,24]]},"assertion":[{"value":"24 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}