{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:47:53Z","timestamp":1778082473121,"version":"3.51.4"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031705458","type":"print"},{"value":"9783031705465","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70546-5_8","type":"book-chapter","created":{"date-parts":[[2024,9,10]],"date-time":"2024-09-10T05:02:47Z","timestamp":1725944567000},"page":"124-141","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["CICA: Content-Injected Contrastive Alignment for\u00a0Zero-Shot Document Image Classification"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6820-3633","authenticated-orcid":false,"given":"Sankalp","family":"Sinha","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7375-807X","authenticated-orcid":false,"given":"Muhammad Saif Ullah","family":"Khan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9156-5679","authenticated-orcid":false,"given":"Talha Uddin","family":"Sheikh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Didier","family":"Stricker","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0536-6867","authenticated-orcid":false,"given":"Muhammad Zeshan","family":"Afzal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,9,11]]},"reference":[{"key":"8_CR1","doi-asserted-by":"crossref","unstructured":"Afzal, M.Z., et al.: DeepDocClassifier: document classification with deep convolutional neural network. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 1111\u20131115. IEEE (2015)","DOI":"10.1109\/ICDAR.2015.7333933"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Afzal, M.Z., K\u00f6lsch, A., Ahmed, S., Liwicki, M.: Cutting the error by half: investigation of very deep CNN and advanced training strategies for document image classification. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol.\u00a01, pp. 883\u2013888. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.149"},{"key":"8_CR3","doi-asserted-by":"publisher","unstructured":"Bay, H., Tuytelaars, T., Van\u00a0Gool, L.: SURF: speeded up robust features. In: Leonardis, A., Bischof, H., Pinz, A. (eds.) Computer Vision\u2013ECCV 2006: 9th European Conference on Computer Vision, Graz, Austria, 7\u201313 May 2006, Proceedings, Part I 9, pp. 404\u2013417. Springer, Cham (2006). https:\/\/doi.org\/10.1007\/11744023_32","DOI":"10.1007\/11744023_32"},{"key":"8_CR4","unstructured":"Brendel, W., Bethge, M.: Approximating CNNs with bag-of-local-features models works surprisingly well on ImageNet. arXiv preprint arXiv:1904.00760 (2019)"},{"key":"8_CR5","doi-asserted-by":"crossref","unstructured":"Cacheux, Y.L., Borgne, H.L., Crucianu, M.: Modeling inter and intra-class relations in the triplet loss for zero-shot learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10333\u201310342 (2019)","DOI":"10.1109\/ICCV.2019.01043"},{"key":"8_CR6","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Chao, W.L., Gong, B., Sha, F.: Synthesized classifiers for zero-shot learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5327\u20135336 (2016)","DOI":"10.1109\/CVPR.2016.575"},{"key":"8_CR7","doi-asserted-by":"publisher","unstructured":"Chao, W.L., Changpinyo, S., Gong, B., Sha, F.: An empirical study and analysis of generalized zero-shot learning for object recognition in the wild. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, 11\u201314 October 2016, Proceedings, Part II 14, pp. 52\u201368. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_4","DOI":"10.1007\/978-3-319-46475-6_4"},{"key":"8_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10032-006-0020-2","volume":"10","author":"N Chen","year":"2007","unstructured":"Chen, N., Blostein, D.: A survey of document image classification: problem statement, classifier architecture and performance evaluation. IJDAR 10, 1\u201316 (2007)","journal-title":"IJDAR"},{"key":"8_CR9","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"8_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1007\/978-3-030-57058-3_8","volume-title":"Document Analysis Systems","author":"A Cosma","year":"2020","unstructured":"Cosma, A., Ghidoveanu, M., Panaitescu-Liess, M., Popescu, M.: Self-supervised representation learning on document images. In: Bai, X., Karatzas, D., Lopresti, D. (eds.) DAS 2020. LNCS, vol. 12116, pp. 103\u2013117. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-57058-3_8"},{"key":"8_CR11","doi-asserted-by":"crossref","unstructured":"Das, A., Roy, S., Bhattacharya, U., Parui, S.K.: Document image classification with intra-domain transfer learning and stacked generalization of deep convolutional neural networks. In: 2018 24th International Conference on Pattern Recognition (ICPR), pp. 3180\u20133185. IEEE (2018)","DOI":"10.1109\/ICPR.2018.8545630"},{"key":"8_CR12","unstructured":"Dauphinee, T., Patel, N., Rashidi, M.: Modular multimodal architecture for document classification. arXiv preprint arXiv:1912.04376 (2019)"},{"key":"8_CR13","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16\u00a0$$\\times $$\u00a016 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"8_CR14","first-page":"21271","volume":"33","author":"JB Grill","year":"2020","unstructured":"Grill, J.B., et al.: Bootstrap your own latent-a new approach to self-supervised learning. Adv. Neural. Inf. Process. Syst. 33, 21271\u201321284 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR15","unstructured":"Gui, J., Chen, T., Cao, Q., Sun, Z., Luo, H., Tao, D.: A survey of self-supervised learning from multiple perspectives: algorithms, theory, applications and future trends. arXiv preprint arXiv:2301.05712 (2023)"},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 991\u2013995. IEEE (2015)","DOI":"10.1109\/ICDAR.2015.7333910"},{"key":"8_CR17","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: International Conference on Document Analysis and Recognition (ICDAR) (2015)","DOI":"10.1109\/ICDAR.2015.7333910"},{"key":"8_CR18","unstructured":"Ji, Z., Fu, Y., Guo, J., Pang, Y., Zhang, Z.M., et\u00a0al.: Stacked semantics-guided attention model for fine-grained zero-shot learning. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"8_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, H., Wang, R., Shan, S., Chen, X.: Transferable contrastive network for generalized zero-shot learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9765\u20139774 (2019)","DOI":"10.1109\/ICCV.2019.00986"},{"issue":"3","key":"8_CR20","doi-asserted-by":"publisher","first-page":"1457","DOI":"10.3390\/app12031457","volume":"12","author":"S Kanchi","year":"2022","unstructured":"Kanchi, S., Pagani, A., Mokayed, H., Liwicki, M., Stricker, D., Afzal, M.Z.: EmmDocClassifier: efficient multimodal document image classifier for scarce data. Appl. Sci. 12(3), 1457 (2022)","journal-title":"Appl. Sci."},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Kang, L., Kumar, J., Ye, P., Li, Y., Doermann, D.: Convolutional neural networks for document image classification. In: 2014 22nd International Conference on Pattern Recognition, pp. 3168\u20133172. IEEE (2014)","DOI":"10.1109\/ICPR.2014.546"},{"key":"8_CR22","unstructured":"Ke, Y., Sukthankar, R.: PCA-SIFT: a more distinctive representation for local image descriptors. In: Proceedings of the 2004 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, 2004, CVPR 2004, vol.\u00a02, pp. II\u2013II. IEEE (2004)"},{"key":"8_CR23","doi-asserted-by":"publisher","unstructured":"Khalifa, M., Vyas, Y., Wang, S., Horwood, G., Mallya, S., Ballesteros, M.: Contrastive training improves zero-shot classification of semi-structured documents. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Findings of the Association for Computational Linguistics: ACL 2023, pp. 7499\u20137508. Association for Computational Linguistics, Toronto, Canada, July 2023. https:\/\/doi.org\/10.18653\/v1\/2023.findings-acl.473, https:\/\/aclanthology.org\/2023.findings-acl.473","DOI":"10.18653\/v1\/2023.findings-acl.473"},{"key":"8_CR24","unstructured":"Khan, M.S.U., Naeem, M.F., Tombari, F., Van\u00a0Gool, L., Stricker, D., Afzal, M.Z.: FocusClip: multimodal subject-level guidance for zero-shot transfer in human-centric tasks. arXiv preprint arXiv:2403.06904 (2024)"},{"key":"8_CR25","doi-asserted-by":"crossref","unstructured":"K\u00f6lsch, A., Afzal, M.Z., Ebbecke, M., Liwicki, M.: Real-time document image classification using deep CNN and extreme learning machines. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol.\u00a01, pp. 1318\u20131323. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.217"},{"key":"8_CR26","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. In: Advances in Neural Information Processing Systems, vol. 25 (2012)"},{"key":"8_CR27","unstructured":"Kumar, J., Ye, P., Doermann, D.: Learning document structure for retrieval and classification. In: Proceedings of the 21st International Conference on Pattern Recognition (ICPR2012), pp. 1558\u20131561. IEEE (2012)"},{"key":"8_CR28","doi-asserted-by":"publisher","first-page":"223","DOI":"10.1016\/j.neucom.2021.04.114","volume":"453","author":"L Liu","year":"2021","unstructured":"Liu, L., Wang, Z., Qiu, T., Chen, Q., Lu, Y., Suen, C.Y.: Document image classification: progress over two decades. Neurocomputing 453, 223\u2013240 (2021)","journal-title":"Neurocomputing"},{"key":"8_CR29","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe, D.G.: Distinctive image features from scale-invariant keypoints. Int. J. Comput. Vision 60, 91\u2013110 (2004)","journal-title":"Int. J. Comput. Vision"},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Mancini, M., Naeem, M.F., Xian, Y., Akata, Z.: Open world compositional zero-shot learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5222\u20135230 (2021)","DOI":"10.1109\/CVPR46437.2021.00518"},{"issue":"10","key":"8_CR31","doi-asserted-by":"publisher","first-page":"1615","DOI":"10.1109\/TPAMI.2005.188","volume":"27","author":"K Mikolajczyk","year":"2005","unstructured":"Mikolajczyk, K., Schmid, C.: A performance evaluation of local descriptors. IEEE Trans. Pattern Anal. Mach. Intell. 27(10), 1615\u20131630 (2005)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"8_CR32","unstructured":"Mindee: doctr: Document text recognition (2021). https:\/\/github.com\/mindee\/doctr"},{"key":"8_CR33","doi-asserted-by":"crossref","unstructured":"Naeem, M.F., et al.: I2MVFormer: large language model generated multi-view document supervision for zero-shot image classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15169\u201315179 (2023)","DOI":"10.1109\/CVPR52729.2023.01456"},{"key":"8_CR34","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"8_CR35","unstructured":"Romera-Paredes, B., Torr, P.: An embarrassingly simple approach to zero-shot learning. In: International Conference on Machine Learning, pp. 2152\u20132161. PMLR (2015)"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"Sch\u00f6nfeld, E., Ebrahimi, S., Sinha, S., Darrell, T., Akata, Z.: Generalized zero-and few-shot learning via aligned variational autoencoders. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2019) (2019)","DOI":"10.1109\/CVPR.2019.00844"},{"key":"8_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1007\/978-3-030-86337-1_20","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"SA Siddiqui","year":"2021","unstructured":"Siddiqui, S.A., Dengel, A., Ahmed, S.: Analyzing the potential of zero-shot recognition for document image classification. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12824, pp. 293\u2013304. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86337-1_20"},{"key":"8_CR38","unstructured":"Smith, R.: Tesseract OCR (2016). https:\/\/github.com\/tesseract-ocr\/tesseract. Accessed 5 Jan 2023"},{"key":"8_CR39","doi-asserted-by":"crossref","unstructured":"Tensmeyer, C., Martinez, T.: Analysis of convolutional neural networks for document image classification. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol.\u00a01, pp. 388\u2013393. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.71"},{"key":"8_CR40","doi-asserted-by":"crossref","unstructured":"Verma, V.K., Arora, G., Mishra, A., Rai, P.: Generalized zero-shot learning via synthesized examples. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4281\u20134289 (2018)","DOI":"10.1109\/CVPR.2018.00450"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Xian, Y., Akata, Z., Sharma, G., Nguyen, Q., Hein, M., Schiele, B.: Latent embeddings for zero-shot classification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 69\u201377 (2016)","DOI":"10.1109\/CVPR.2016.15"},{"issue":"9","key":"8_CR42","doi-asserted-by":"publisher","first-page":"2251","DOI":"10.1109\/TPAMI.2018.2857768","volume":"41","author":"Y Xian","year":"2018","unstructured":"Xian, Y., Lampert, C.H., Schiele, B., Akata, Z.: Zero-shot learning-a comprehensive evaluation of the good, the bad and the ugly. IEEE Trans. Pattern Anal. Mach. Intell. 41(9), 2251\u20132265 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"8_CR43","first-page":"21969","volume":"33","author":"W Xu","year":"2020","unstructured":"Xu, W., Xian, Y., Wang, J., Schiele, B., Akata, Z.: Attribute prototype network for zero-shot learning. Adv. Neural. Inf. Process. Syst. 33, 21969\u201321980 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR44","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Elhoseiny, M., Liu, B., Peng, X., Elgammal, A.: A generative adversarial approach for zero-shot learning from noisy texts. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1004\u20131013 (2018)","DOI":"10.1109\/CVPR.2018.00111"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition - ICDAR 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70546-5_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,10]],"date-time":"2024-09-10T05:04:47Z","timestamp":1725944687000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70546-5_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031705458","9783031705465"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70546-5_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"11 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Athens","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdar2024.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}