{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T23:13:27Z","timestamp":1769123607892,"version":"3.49.0"},"publisher-location":"Cham","reference-count":42,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030687953","type":"print"},{"value":"9783030687960","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-68796-0_36","type":"book-chapter","created":{"date-parts":[[2021,2,20]],"date-time":"2021-02-20T16:28:24Z","timestamp":1613838504000},"page":"502-516","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["Iconographic Image Captioning for Artworks"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5330-1259","authenticated-orcid":false,"given":"Eva","family":"Cetinic","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,2,21]]},"reference":[{"key":"36_CR1","doi-asserted-by":"crossref","unstructured":"Baraldi, L., Cornia, M., Grana, C., Cucchiara, R.: Aligning text and document illustrations: towards visually explainable digital humanities. In: 2018 24th International Conference on Pattern Recognition (ICPR), pp. 1097\u20131102. IEEE (2018)","DOI":"10.1109\/ICPR.2018.8545064"},{"key":"36_CR2","doi-asserted-by":"crossref","unstructured":"Bongini, P., Becattini, F., Bagdanov, A.D., Del Bimbo, A.: Visual question answering for cultural heritage. arXiv preprint arXiv:2003.09853 (2020)","DOI":"10.1088\/1757-899X\/949\/1\/012074"},{"key":"36_CR3","series-title":"Communications in Computer and Information Science","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1007\/978-3-030-39905-4_11","volume-title":"Digital Libraries: The Era of Big Data and Data Science","author":"G Castellano","year":"2020","unstructured":"Castellano, G., Vessio, G.: Towards a tool for visual link retrieval and knowledge discovery in painting datasets. In: Ceci, M., Ferilli, S., Poggi, A. (eds.) IRCDL 2020. CCIS, vol. 1177, pp. 105\u2013110. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-39905-4_11"},{"key":"36_CR4","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1016\/j.eswa.2018.07.026","volume":"114","author":"E Cetinic","year":"2018","unstructured":"Cetinic, E., Lipic, T., Grgic, S.: Fine-tuning convolutional neural networks for fine art classification. Expert Syst. Appl. 114, 107\u2013118 (2018)","journal-title":"Expert Syst. Appl."},{"key":"36_CR5","doi-asserted-by":"publisher","first-page":"73694","DOI":"10.1109\/ACCESS.2019.2921101","volume":"7","author":"E Cetinic","year":"2019","unstructured":"Cetinic, E., Lipic, T., Grgic, S.: A deep learning perspective on beauty, sentiment, and remembrance of art. IEEE Access 7, 73694\u201373710 (2019)","journal-title":"IEEE Access"},{"key":"36_CR6","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1016\/j.patrec.2019.11.008","volume":"129","author":"E Cetinic","year":"2020","unstructured":"Cetinic, E., Lipic, T., Grgic, S.: Learning the principles of art history with convolutional neural networks. Pattern Recogn. Lett. 129, 56\u201362 (2020)","journal-title":"Pattern Recogn. Lett."},{"key":"36_CR7","unstructured":"Chen, Y.C., et al.: UNITER: learning universal image-text representations. arXiv preprint arXiv:1909.11740 (2019)"},{"key":"36_CR8","doi-asserted-by":"publisher","first-page":"166","DOI":"10.1016\/j.patrec.2019.11.018","volume":"129","author":"M Cornia","year":"2020","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Corsini, M., Cucchiara, R.: Explaining digital humanities by aligning images and textual descriptions. Pattern Recogn. Lett. 129, 166\u2013172 (2020)","journal-title":"Pattern Recogn. Lett."},{"issue":"2","key":"36_CR9","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1017\/S0307472200003436","volume":"8","author":"LD Couprie","year":"1983","unstructured":"Couprie, L.D.: Iconclass: an iconographic classification system. Art Libr. J. 8(2), 32\u201349 (1983)","journal-title":"Art Libr. J."},{"key":"36_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1007\/978-3-319-16178-5_4","volume-title":"Computer Vision - ECCV 2014 Workshops","author":"EJ Crowley","year":"2015","unstructured":"Crowley, E.J., Zisserman, A.: In search of art. In: Agapito, L., Bronstein, M.M., Rother, C. (eds.) ECCV 2014. LNCS, vol. 8925, pp. 54\u201370. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-16178-5_4"},{"key":"36_CR11","doi-asserted-by":"crossref","unstructured":"Deng, Y., Tang, F., Dong, W., Ma, C., Huang, F., Deussen, O., Xu, C.: Exploring the representativity of art paintings. IEEE Trans. Multimed. (2020)","DOI":"10.1109\/TMM.2020.3016887"},{"key":"36_CR12","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor Universal: language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"36_CR13","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"36_CR14","doi-asserted-by":"crossref","unstructured":"Elgammal, A., Liu, B., Kim, D., Elhoseiny, M., Mazzone, M.: The shape of art history in the eyes of the machine. In: 32nd AAAI Conference on Artificial Intelligence, AAAI 2018, pp. 2183\u20132191. AAAI press (2018)","DOI":"10.1609\/aaai.v32i1.11894"},{"key":"36_CR15","doi-asserted-by":"crossref","unstructured":"Garcia, N., Vogiatzis, G.: How to read paintings: semantic art understanding with multi-modal retrieval. In: Proceedings of the European Conference on Computer Vision (ECCV) (2018)","DOI":"10.1007\/978-3-030-11012-3_52"},{"key":"36_CR16","unstructured":"Garcia, N., et al.: A dataset and baselines for visual question answering on art. arXiv preprint arXiv:2008.12520 (2020)"},{"key":"36_CR17","unstructured":"Gupta, J., Madhu, P., Kosti, R., Bell, P., Maier, A., Christlein, V.: Towards image caption generation for art historical data. In: AI Methods for Digital Heritage, Workshop at KI2020 43rd German Conference on Artificial Intelligence (2020)"},{"key":"36_CR18","doi-asserted-by":"crossref","unstructured":"Hayn-Leichsenring, G.U., Lehmann, T., Redies, C.: Subjective ratings of beauty and aesthetics: correlations with statistical image properties in western oil paintings. i-Perception 8(3), 2041669517715474 (2017)","DOI":"10.1177\/2041669517715474"},{"key":"36_CR19","doi-asserted-by":"crossref","unstructured":"Jenicek, T., Chum, O.: Linking art through human poses. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1338\u20131345. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00216"},{"issue":"1","key":"36_CR20","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123(1), 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"36_CR21","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"36_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"36_CR23","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: VilBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, pp. 13\u201323 (2019)"},{"key":"36_CR24","doi-asserted-by":"crossref","unstructured":"Madhu, P., Kosti, R., M\u00fchrenberg, L., Bell, P., Maier, A., Christlein, V.: Recognizing characters in art history using deep learning. In: Proceedings of the 1st Workshop on Structuring and Understanding of Multimedia heritAge Contents, pp. 15\u201322 (2019)","DOI":"10.1145\/3347317.3357242"},{"key":"36_CR25","unstructured":"Panofsky, E.: Studies in Iconology. Humanistic Themes in the Art of the Renaissance. Harper and Row, New York (1972)"},{"key":"36_CR26","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"36_CR27","unstructured":"Posthumus, E.: Brill Iconclass AI test set (2020)"},{"key":"36_CR28","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, pp. 91\u201399 (2015)"},{"key":"36_CR29","doi-asserted-by":"publisher","first-page":"41770","DOI":"10.1109\/ACCESS.2019.2907986","volume":"7","author":"C Sandoval","year":"2019","unstructured":"Sandoval, C., Pirogova, E., Lech, M.: Two-stage deep learning approach to the classification of fine-art paintings. IEEE Access 7, 41770\u201341781 (2019)","journal-title":"IEEE Access"},{"issue":"2","key":"36_CR30","doi-asserted-by":"publisher","first-page":"283","DOI":"10.3390\/heritage3020017","volume":"3","author":"G Sargentis","year":"2020","unstructured":"Sargentis, G., Dimitriadis, P., Koutsoyiannis, D., et al.: Aesthetical issues of leonardo da vinci\u2019s and pablo picasso\u2019s paintings with stochastic evaluation. Heritage 3(2), 283\u2013305 (2020)","journal-title":"Heritage"},{"key":"36_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1007\/978-3-319-46604-0_52","volume-title":"Computer Vision \u2013 ECCV 2016 Workshops","author":"B Seguin","year":"2016","unstructured":"Seguin, B., Striolo, C., diLenardo, I., Kaplan, F.: Visual link retrieval in a database of paintings. In: Hua, G., J\u00e9gou, H. (eds.) ECCV 2016. LNCS, vol. 9913, pp. 753\u2013767. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46604-0_52"},{"key":"36_CR32","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"36_CR33","doi-asserted-by":"crossref","unstructured":"Shen, X., Efros, A.A., Aubry, M.: Discovering visual patterns in art collections with spatially-consistent feature learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 9278\u20139287 (2019)","DOI":"10.1109\/CVPR.2019.00950"},{"key":"36_CR34","doi-asserted-by":"crossref","unstructured":"Sheng, S., Moens, M.F.: Generating captions for images of ancient artworks. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 2478\u20132486 (2019)","DOI":"10.1145\/3343031.3350972"},{"key":"36_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"729","DOI":"10.1007\/978-3-030-30645-8_66","volume-title":"Image Analysis and Processing \u2013 ICIAP 2019","author":"M Stefanini","year":"2019","unstructured":"Stefanini, M., Cornia, M., Baraldi, L., Corsini, M., Cucchiara, R.: Artpedia: a new visual-semantic dataset with visual and contextual sentences in the artistic domain. In: Ricci, E., Rota Bul\u00f2, S., Snoek, C., Lanz, O., Messelodi, S., Sebe, N. (eds.) ICIAP 2019. LNCS, vol. 11752, pp. 729\u2013740. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-30645-8_66"},{"key":"36_CR36","unstructured":"Strezoski, G., Worring, M.: OmniArt: a large-scale artistic benchmark. ACM Trans. Multimed. Comput. Commun. Appl. (TOMM) 14(4), 1\u201321 (2018)"},{"key":"36_CR37","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"36_CR38","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"36_CR39","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: A neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"36_CR40","doi-asserted-by":"crossref","unstructured":"Xia, Q., et al.: XGPT: cross-modal generative pre-training for image captioning. arXiv preprint arXiv:2003.01473 (2020)","DOI":"10.1007\/978-3-030-88480-2_63"},{"key":"36_CR41","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"36_CR42","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J.J., Gao, J.: Unified vision-language pre-training for image captioning and VQA. In: AAAI, pp. 13041\u201313049 (2020)","DOI":"10.1609\/aaai.v34i07.7005"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition. ICPR International Workshops and Challenges"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-68796-0_36","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,18]],"date-time":"2022-12-18T09:20:34Z","timestamp":1671355234000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-68796-0_36"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030687953","9783030687960"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-68796-0_36","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"21 February 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 January 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ICPR2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.icpr2020.it\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}