{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,18]],"date-time":"2026-05-18T10:18:12Z","timestamp":1779099492684,"version":"3.51.4"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031250552","type":"print"},{"value":"9783031250569","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-25056-9_18","type":"book-chapter","created":{"date-parts":[[2023,2,14]],"date-time":"2023-02-14T12:09:56Z","timestamp":1676376596000},"page":"268-281","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["Is GPT-3 All You Need for\u00a0Visual Question Answering in\u00a0Cultural Heritage?"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8217-6266","authenticated-orcid":false,"given":"Pietro","family":"Bongini","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2537-2700","authenticated-orcid":false,"given":"Federico","family":"Becattini","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1052-8322","authenticated-orcid":false,"given":"Alberto","family":"Del Bimbo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,2,15]]},"reference":[{"key":"18_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"18_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"18_CR3","doi-asserted-by":"publisher","unstructured":"Asprino, L., Bulla, L., Marinucci, L., Mongiov\u00ec, M., Presutti, V.: A large visual question answering dataset for cultural heritage. In: International Conference on Machine Learning, Optimization, and Data Science, pp. 193\u2013197. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-95470-3_14","DOI":"10.1007\/978-3-030-95470-3_14"},{"key":"18_CR4","doi-asserted-by":"publisher","first-page":"325","DOI":"10.1016\/j.patrec.2021.09.008","volume":"151","author":"S Barra","year":"2021","unstructured":"Barra, S., Bisogni, C., De Marsico, M., Ricciardi, S.: Visual question answering: which investigated applications? Pattern Recogn. Lett. 151, 325\u2013331 (2021)","journal-title":"Pattern Recogn. Lett."},{"key":"18_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"781","DOI":"10.1007\/978-3-319-48496-9_62","volume-title":"Digital Heritage. Progress in Cultural Heritage: Documentation, Preservation, and Protection","author":"F Becattini","year":"2016","unstructured":"Becattini, F., Ferracani, A., Landucci, L., Pezzatini, D., Uricchio, T., Del Bimbo, A.: Imaging Novecento. A mobile app for automatic recognition of artworks and transfer of artistic styles. In: Ioannides, M., et al. (eds.) EuroMed 2016. LNCS, vol. 10058, pp. 781\u2013791. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-48496-9_62"},{"key":"18_CR6","doi-asserted-by":"crossref","unstructured":"Bongini, P., Becattini, F., Bagdanov, A.D., Del Bimbo, A.: Visual question answering for cultural heritage. In: IOP Conference Series: Materials Science and Engineering, vol. 949, p. 012074. IOP Publishing (2020)","DOI":"10.1088\/1757-899X\/949\/1\/012074"},{"key":"18_CR7","unstructured":"Brown, T.B., et al.: Language models are few-shot learners (2020)"},{"key":"18_CR8","doi-asserted-by":"crossref","unstructured":"Cetinic, E., She, J.: Understanding and creating art with AI: review and outlook. ACM Trans. Multimedia Comput. Commun. Appl. (TOMM) 18(2), 1\u201322 (2022)","DOI":"10.1145\/3475799"},{"key":"18_CR9","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10578\u201310587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"issue":"1","key":"18_CR10","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1017\/S1351324920000601","volume":"27","author":"R Dale","year":"2021","unstructured":"Dale, R.: GPT-3: what\u2019s it good for? Nat. Lang. Eng. 27(1), 113\u2013118 (2021)","journal-title":"Nat. Lang. Eng."},{"key":"18_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"issue":"2","key":"18_CR12","doi-asserted-by":"publisher","first-page":"17212","DOI":"10.22148\/001c.17212","volume":"5","author":"K Elkins","year":"2020","unstructured":"Elkins, K., Chun, J.: Can GPT-3 pass a writer\u2019s Turing test? J. Cult. Analytics 5(2), 17212 (2020)","journal-title":"J. Cult. Analytics"},{"key":"18_CR13","doi-asserted-by":"publisher","first-page":"102","DOI":"10.1016\/j.patrec.2020.02.017","volume":"133","author":"M Fiorucci","year":"2020","unstructured":"Fiorucci, M., Khoroshiltseva, M., Pontil, M., Traviglia, A., Del Bue, A., James, S.: Machine learning for cultural heritage: a survey. Pattern Recogn. Lett. 133, 102\u2013108 (2020)","journal-title":"Pattern Recogn. Lett."},{"key":"18_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"92","DOI":"10.1007\/978-3-030-66096-3_8","volume-title":"Computer Vision \u2013 ECCV 2020 Workshops","author":"N Garcia","year":"2020","unstructured":"Garcia, N., et al.: A dataset and baselines for visual question answering on art. In: Bartoli, A., Fusiello, A. (eds.) ECCV 2020. LNCS, vol. 12536, pp. 92\u2013108. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-66096-3_8"},{"issue":"1","key":"18_CR15","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1109\/MSP.2017.2749125","volume":"35","author":"J Han","year":"2018","unstructured":"Han, J., Zhang, D., Cheng, G., Liu, N., Xu, D.: Advanced deep-learning techniques for salient and category-specific object detection: a survey. IEEE Signal Process. Mag. 35(1), 84\u2013100 (2018)","journal-title":"IEEE Signal Process. Mag."},{"key":"18_CR16","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1016\/j.neunet.2017.12.005","volume":"99","author":"SR Kheradpisheh","year":"2018","unstructured":"Kheradpisheh, S.R., Ganjtabesh, M., Thorpe, S.J., Masquelier, T.: STDP-based spiking deep convolutional neural networks for object recognition. Neural Netw. 99, 56\u201367 (2018)","journal-title":"Neural Netw."},{"key":"18_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"18_CR18","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"18_CR19","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhu, Z., Ye, N., Guadarrama, S., Murphy, K.: Improved image captioning via policy gradient optimization of spider. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 873\u2013881 (2017)","DOI":"10.1109\/ICCV.2017.100"},{"key":"18_CR20","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. In: Advances in Neural Information Processing Systems, pp. 289\u2013297 (2016)"},{"key":"18_CR21","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"18_CR22","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et al.: Improving language understanding by generative pre-training (2018)"},{"issue":"8","key":"18_CR23","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"18_CR24","doi-asserted-by":"crossref","unstructured":"Rajpurkar, P., Zhang, J., Lopyrev, K., Liang, P.: SQuAD: 100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250 (2016)","DOI":"10.18653\/v1\/D16-1264"},{"key":"18_CR25","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, pp. 91\u201399 (2015)"},{"issue":"5","key":"18_CR26","doi-asserted-by":"publisher","first-page":"513","DOI":"10.1016\/0306-4573(88)90021-0","volume":"24","author":"G Salton","year":"1988","unstructured":"Salton, G., Buckley, C.: Term-weighting approaches in automatic text retrieval. Inf. Process. Manage. 24(5), 513\u2013523 (1988)","journal-title":"Inf. Process. Manage."},{"key":"18_CR27","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)"},{"key":"18_CR28","doi-asserted-by":"crossref","unstructured":"Seidenari, L., Galteri, L., Bongini, P., Bertini, M., Del Bimbo, A.: Language based image quality assessment. In: ACM Multimedia Asia, pp. 1\u20137 (2021)","DOI":"10.1145\/3469877.3490605"},{"key":"18_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1007\/978-3-030-15719-7_12","volume-title":"Advances in Information Retrieval","author":"S Sheng","year":"2019","unstructured":"Sheng, S., Laenen, K., Moens, M.-F.: Can image captioning help passage retrieval in multimodal question answering? In: Azzopardi, L., Stein, B., Fuhr, N., Mayr, P., Hauff, C., Hiemstra, D. (eds.) ECIR 2019. LNCS, vol. 11438, pp. 94\u2013101. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-15719-7_12"},{"key":"18_CR30","doi-asserted-by":"crossref","unstructured":"Shih, K.J., Singh, S., Hoiem, D.: Where to look: focus regions for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4613\u20134621 (2016)","DOI":"10.1109\/CVPR.2016.499"},{"key":"18_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"729","DOI":"10.1007\/978-3-030-30645-8_66","volume-title":"Image Analysis and Processing \u2013 ICIAP 2019","author":"M Stefanini","year":"2019","unstructured":"Stefanini, M., Cornia, M., Baraldi, L., Corsini, M., Cucchiara, R.: Artpedia: a new visual-semantic dataset with visual and contextual sentences in the artistic domain. In: Ricci, E., Rota Bul\u00f2, S., Snoek, C., Lanz, O., Messelodi, S., Sebe, N. (eds.) ICIAP 2019. LNCS, vol. 11752, pp. 729\u2013740. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-30645-8_66"},{"key":"18_CR32","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530 (2019)"},{"issue":"3","key":"18_CR33","doi-asserted-by":"publisher","first-page":"169","DOI":"10.1007\/s11633-022-1331-6","volume":"19","author":"TX Sun","year":"2022","unstructured":"Sun, T.X., Liu, X.Y., Qiu, X.P., Huang, X.J.: Paradigm shift in natural language processing. Mach. Intell. Res. 19(3), 169\u2013183 (2022)","journal-title":"Mach. Intell. Res."},{"key":"18_CR34","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"18_CR35","doi-asserted-by":"crossref","unstructured":"Vannoni, F., Bongini, P., Becattini, F., Bagdanov, A.D., Bimbo, A.: Data collection for contextual and visual question answering in the cultural heritage domain (2020)","DOI":"10.1088\/1757-899X\/949\/1\/012074"},{"key":"18_CR36","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"18_CR37","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"18_CR38","unstructured":"Wang, P., et al.: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. arXiv preprint arXiv:2202.03052 (2022)"},{"key":"18_CR39","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: An empirical study of GPT-3 for few-shot knowledge-based VQA. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 3081\u20133089 (2022)","DOI":"10.1609\/aaai.v36i3.20215"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-25056-9_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T18:31:14Z","timestamp":1710268274000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-25056-9_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031250552","9783031250569"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-25056-9_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"15 February 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"From the workshops, 367 reviewed full papers have been selected for publication","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}