{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T07:59:49Z","timestamp":1771919989853,"version":"3.50.1"},"reference-count":82,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2023,12,5]],"date-time":"2023-12-05T00:00:00Z","timestamp":1701734400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,5]],"date-time":"2023-12-05T00:00:00Z","timestamp":1701734400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100003407","name":"Ministero dell\u2019Istruzione, dell\u2019Universit\u00e0 e della Ricerca","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003407","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2024,5]]},"DOI":"10.1007\/s11263-023-01949-w","type":"journal-article","created":{"date-parts":[[2023,12,5]],"date-time":"2023-12-05T11:01:56Z","timestamp":1701774116000},"page":"1701-1720","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Generating More Pertinent Captions by Leveraging Semantics and Style on Multi-Source Datasets"],"prefix":"10.1007","volume":"132","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9640-9385","authenticated-orcid":false,"given":"Marcella","family":"Cornia","sequence":"first","affiliation":[]},{"given":"Lorenzo","family":"Baraldi","sequence":"additional","affiliation":[]},{"given":"Giuseppe","family":"Fiameni","sequence":"additional","affiliation":[]},{"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,5]]},"reference":[{"key":"1949_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, H., Desai, K., Wang, Y., Chen, X., Jain, R., Johnson, M., Batra, D., Parikh, D., Lee, S., & Anderson, P. (2019). Nocaps: Novel object captioning at scale. In Proceedings of the IEEE\/CVF international conference on computer vision.","DOI":"10.1109\/ICCV.2019.00904"},{"key":"1949_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J. B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., & Ring, R. (2022). Flamingo: A visual language model for few-shot learning. Advances in Neural Information Processing Systems, 35, 23716\u201323736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"1949_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., & Gould, S. (2016). SPICE: Semantic propositional image caption evaluation. in Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1949_CR4","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., & Zhang, L. (2018). Bottom-up and top-down attention for image captioning and visual question answering. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"1949_CR5","unstructured":"Banerjee, S., & Lavie, A. (2005). METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the annual meeting of the association for computational linguistics workshops."},{"key":"1949_CR6","doi-asserted-by":"crossref","unstructured":"Barraco, M., Cornia, M., Cascianelli, S., Baraldi, L., & Cucchiara, R. (2022). The unreasonable effectiveness of CLIP features for image captioning: An experimental analysis. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops.","DOI":"10.1109\/CVPRW56347.2022.00512"},{"key":"1949_CR7","doi-asserted-by":"crossref","unstructured":"Barraco, M., Stefanini, M., Cornia, M., Cascianelli, S., Baraldi, L., & Cucchiara, R. (2022). CaMEL: Mean teacher learning for image captioning. In Proceedings of the international conference on pattern recognition.","DOI":"10.1109\/ICPR56361.2022.9955644"},{"key":"1949_CR8","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., & Agarwal, S. (2020). Language models are few-shot learners. Advances in Neural Information Processing Systems, 33, 1877\u20131901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"1949_CR9","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., & Soricut, R. (2021). Conceptual 12M: Pushing web-scale image-text pre-training to recognize long-tail visual concepts. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"1949_CR10","doi-asserted-by":"crossref","unstructured":"Chen, T., Zhang, Z., You, Q., Fang, C., Wang, Z., Jin, H., & Luo, J. (2018). \u201cFactual\u201d or\u201cEmotional\u201d: Stylized image captioning with adaptive learning and attention. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-030-01249-6_32"},{"key":"1949_CR11","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., Li, L., Yu, L., El Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., & Liu, J. (2020). UNITER: Learning UNiversal image-TExt representations. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"1949_CR12","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., & Cucchiara, R. (2020). Meshed-memory transformer for image captioning. in Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"1949_CR13","unstructured":"Devlin, J., Chang, M.-W., Lee, K., & Toutanova, K. (2018). BERT: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the annual conference of the North American chapter of the association for computational linguistics."},{"key":"1949_CR14","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., & Uszkoreit, J., (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In Proceedings of the international conference on learning representations."},{"key":"1949_CR15","doi-asserted-by":"crossref","unstructured":"Gan, C., Gan, Z., He, X., Gao, J., & Deng, L. (2017). StyleNet: Generating attractive visual captions with styles. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.108"},{"key":"1949_CR16","doi-asserted-by":"crossref","unstructured":"Gao, T., Fisch, A., & Chen, D. (2021). Making pretrained language models better few-shot learners. In Proceedings of the annual meeting of the association for computational linguistics.","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"1949_CR17","unstructured":"Glorot, X., & Bengio, Y. (2010). Understanding the difficulty of training deep feedforward neural networks. In Proceedings of the international conference on artificial intelligence and statistics."},{"key":"1949_CR18","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Tang, J., Li, J., Luo, W., & Lu, H. (2019). Aligning linguistic words and visual semantic units for image captioning.","DOI":"10.1145\/3343031.3350943"},{"key":"1949_CR19","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Yao, P., Li, J., & Lu, H. (2019). MSCap: Multi-style image captioning with unpaired stylized text. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2019.00433"},{"key":"1949_CR20","doi-asserted-by":"crossref","unstructured":"Gurari, D., Zhao, Y., Zhang, M., & Bhattacharya, N. (2020). Captioning images taken by people who are blind. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-030-58520-4_25"},{"key":"1949_CR21","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R. L., & Choi, Y. (2021). CLIPScore: A reference-free evaluation metric for image captioning. In Proceedings of the conference on empirical methods in natural language processing.","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"1949_CR22","doi-asserted-by":"crossref","unstructured":"Hu, X., Gan, Z., Wang, J., Yang, Z., Liu, Z., Lu, Y., & Wang, L. (2022). Scaling up vision- language pre-training for image captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01745"},{"key":"1949_CR23","doi-asserted-by":"crossref","unstructured":"Hu, X., Yin, X., Lin, K.,Wang, L., Zhang, L., Gao, J., & Liu, Z. (2020). VIVO: Visual vocabulary pre-training for novel object captioning. In Proceedings of the AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v35i2.16249"},{"key":"1949_CR24","doi-asserted-by":"crossref","unstructured":"Huang, L.,Wang, W., Chen, J., & Wei, X.-Y. (2019). Attention on attention for image captioning. In Proceedings of the IEEE\/CVF international conference on computer vision.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"1949_CR25","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision."},{"issue":"3","key":"1949_CR26","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","volume":"7","author":"J Johnson","year":"2019","unstructured":"Johnson, J., Douze, M., & J\u00e9egou, H. (2019). Billionscale similarity search with gpus. IEEE Transactions on Big Data, 7(3), 535\u2013547.","journal-title":"IEEE Transactions on Big Data"},{"key":"1949_CR27","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visualsemantic alignments for generating image descriptions. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"1949_CR28","unstructured":"Kim, W., Son, B., & Kim, I. (2021). ViLT: Vision-and-language transformer without convolution or region supervision. In Proceedings of the international conference on machine learning."},{"key":"1949_CR29","unstructured":"Kingma, D. P., & Ba, J. (2015). Adam: A method for stochastic optimization. In Proceedings of the international conference on learning representations."},{"key":"1949_CR30","doi-asserted-by":"crossref","unstructured":"Klein, F., Mahajan, S., & Roth, S. (2021). Diverse image captioning with grounded style. In Proceeding of the DAGM German conference on pattern recognition.","DOI":"10.1007\/978-3-030-92659-5_27"},{"issue":"1","key":"1949_CR31","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., & Fei-Fei, L. (2017). Visual genome: Connecting language and vision using crowdsourced dense image annotations. International Journal of Computer Vision, 123(1), 32\u201373.","journal-title":"International Journal of Computer Vision"},{"issue":"7","key":"1949_CR32","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A., Rom, H., Alldrin, N., Uijlings, J., Krasin, I., Pont-Tuset, J., & Ferrari, V. (2020). The open images dataset V4. International Journal of Computer Vision, 128(7), 1956\u20131981.","journal-title":"International Journal of Computer Vision"},{"key":"1949_CR33","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., & Jiang, D. (2020a). Unicoder-VL: A universal encoder for vision and language by cross-modal pre-training. In Proceedings of the AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"1949_CR34","doi-asserted-by":"crossref","unstructured":"Li, G., Zhai, Y., Lin, Z., & Zhang, Y. (2021). Similar scenes arouse similar emotions: Parallel data augmentation for stylized image captioning. In Proceedings of the ACM international conference on multimedia.","DOI":"10.1145\/3474085.3475662"},{"key":"1949_CR35","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023). BLIP- 2: Bootstrapping language-image pretraining with frozen image encoders and large language models. In Proceedings of the international conference on machine learning."},{"key":"1949_CR36","unstructured":"Li, J., Li, D., Xiong, C., & Hoi, S. (2022a). BLIP: Bootstrapping language-image pretraining for unified vision-language understanding and generation. In Proceedings of the international conference on machine learning."},{"key":"1949_CR37","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., Hu, X., Zhang, L., Wang, L., Hu, H., Dong, L., Wei, F., & Choi, Y. (2020b). Oscar: Object-semantics aligned pre-training for vision-language tasks. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"1949_CR38","doi-asserted-by":"crossref","unstructured":"Li, Y., Pan, Y., Yao, T., & Mei, T. (2022b). Comprehending and ordering semantics for image captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01746"},{"key":"1949_CR39","unstructured":"Lin, C.-Y. (2004). Rouge: A package for automatic evaluation of summaries. In Proceedings of the Annual meeting of the association for computational linguistics workshops."},{"key":"1949_CR40","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollar, P., & Zitnick, C. L. (2014). Microsoft COCO: Common objects in context. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1949_CR41","unstructured":"Lu, J., Batra, D., Parikh, D., & Lee, S. (2019). ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and- language tasks. In Advances in neural information processing systems."},{"key":"1949_CR42","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., & Socher, R. (2017). Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.345"},{"key":"1949_CR43","doi-asserted-by":"crossref","unstructured":"Mathews, A., Xie, L., & He, X. (2016). SentiCap: Generating image descriptions with sentiments. In Proceedings of the AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"1949_CR44","doi-asserted-by":"crossref","unstructured":"Mathews, A., Xie, L., & He, X. (2018). Semstyle: Learning to generate stylised image captions using unaligned text. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2018.00896"},{"key":"1949_CR45","unstructured":"Micikevicius, P., Narang, S., Alben, J., Diamos, G., Elsen, E., Garcia, D., Ginsburg, B., Houston, M., Kuchaiev, O., Venkatesh, G., & Wu, H. (2018). Mixed precision training. in proceedings of the international conference on learning representations."},{"key":"1949_CR46","unstructured":"Mokady, R., Hertz, A., & Bermano, A. H. (2021). ClipCap: CLIP prefix for image captioning. arXiv preprint arXiv:2111.09734"},{"key":"1949_CR47","unstructured":"Ordonez, V., Kulkarni, G., & Berg, T. (2011). Im2Text: Describing images using 1 million captioned photographs. In Advances in neural information processing systems."},{"key":"1949_CR48","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., & Mei, T. (2020). X-linear attention networks for image captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"1949_CR49","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., & Zhu, W.-J. (2002). BLEU: A method for automatic evaluation of machine translation. In Proceedings of the annual meeting of the association for computational linguistics.","DOI":"10.3115\/1073083.1073135"},{"key":"1949_CR50","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision."},{"key":"1949_CR51","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., & Sutskever, I. (2019). Language models are unsupervised multitask learners."},{"key":"1949_CR52","doi-asserted-by":"crossref","unstructured":"Rajbhandari, S., Rasley, J., Ruwase, O., & He, Y. (2020). ZeRO: Memory optimizations toward training trillion parameter models. In Proceedings of the international conference for high performance computing, networking, storage and analysis.","DOI":"10.1109\/SC41405.2020.00024"},{"key":"1949_CR53","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., & Goel, V. (2017). Self-critical sequence training for image captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.131"},{"key":"1949_CR54","unstructured":"Ridnik, T., Ben-Baruch, E., Noy, A., & Zelnik- Manor, L. (2021). ImageNet-21K pretraining for the masses. in Advances in neural information processing systems."},{"key":"1949_CR55","doi-asserted-by":"crossref","unstructured":"Sarto, S., Barraco, M., Cornia, M., Baraldi, L., & Cucchiara, R. (2023). Positive-augmented contrastive learning for image and video captioning evaluation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52729.2023.00668"},{"key":"1949_CR56","doi-asserted-by":"crossref","unstructured":"Sarto, S., Cornia, M., Baraldi, L., & Cucchiara, R. (2022). Retrieval-augmented transformer for image captioning.","DOI":"10.1145\/3549555.3549585"},{"key":"1949_CR57","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., & Schramowski, P. (2022). LAION-5B: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems., 35, 25278\u201325294.","journal-title":"Advances in Neural Information Processing Systems."},{"key":"1949_CR58","unstructured":"Schuhmann, C., Vencu, R., Beaumont, R., Kaczmarczyk, R., Mullis, C., Katta, A., Coombes, T., Jitsev, J., & Komatsuzaki, A., (2021). LAION-400M: Open dataset of CLIP-filtered 400 million image-text pairs. In Advances in neural information processing systems."},{"key":"1949_CR59","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., & Birch, A. (2016). Neural machine translation of rare words with subword units. In Proceedings of the annual meeting of the association for computational linguistics.","DOI":"10.18653\/v1\/P16-1162"},{"key":"1949_CR60","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., & Soricut, R. (2018). Conceptual Captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In Proceedings of the annual meeting of the association for computational linguistics.","DOI":"10.18653\/v1\/P18-1238"},{"key":"1949_CR61","unstructured":"Shen, S., Li, L.H., Tan, H., Bansal, M., Rohrbach, A., Chang, K.W., Yao, Z., & Keutzer, K. (2022). How much can CLIP benefit vision-and- language tasks? In Proceedings of the international conference on learning representations."},{"key":"1949_CR62","doi-asserted-by":"crossref","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., & Singh, A. (2020). TextCaps: A dataset for image captioning with reading comprehension. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"1949_CR63","doi-asserted-by":"crossref","unstructured":"Srinivasan, K., Raman, K., Chen, J., Bendersky, M., & Najork, M. (2021). WIT: Wikipedia-based image text dataset for multimodal multilingual machine learning. In ACM SIGIR conference on research and development in information retrieval.","DOI":"10.1145\/3404835.3463257"},{"issue":"1","key":"1949_CR64","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1109\/TPAMI.2022.3148210","volume":"45","author":"M Stefanini","year":"2022","unstructured":"Stefanini, M., Cornia, M., Baraldi, L., Cascianelli, S., Fiameni, G., & Cucchiara, R. (2022). From show to tell: A survey on deep learningbased image captioning. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45(1), 539\u2013559.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1949_CR65","unstructured":"Su, W., Zhu, X., Cao, Y., Li, B., Lu, L., Wei, F., & Dai, J. (2020). VL-BERT: Pre-training of generic visual-linguistic representations. In Proceedings of the international conference on learning representations."},{"key":"1949_CR66","doi-asserted-by":"crossref","unstructured":"Tan, H., & Bansal, M. (2019). LXMERT: Learning cross-modality encoder representations from transformers. In Proceedings of the conference on empirical methods in natural language processing.","DOI":"10.18653\/v1\/D19-1514"},{"issue":"2","key":"1949_CR67","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., Shamma, D. A., Friedland, G., Elizalde, B., Ni, K., Poland, D., & Li, L.-J. (2016). YFCC100M: The new data in multimedia research. Communications of the ACM, 59(2), 64\u201373.","journal-title":"Communications of the ACM"},{"key":"1949_CR68","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., & Polosukhin, I. (2017). Attention is all you need. In Advances in neural information processing systems."},{"key":"1949_CR69","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., & Parikh, D. (2015). CIDEr: Consensus-based image description evaluation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1949_CR70","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., & Erhan, D. (2015). Show and tell: A neural image caption generator. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1949_CR71","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., & Cao, Y. (2022). SimVLM: Simple visual language model pretraining with weak supervision. In Proceedings of the international conference on learning representations."},{"key":"1949_CR72","doi-asserted-by":"crossref","unstructured":"Xu, H., Yan, M., Li, C., Bi, B., Huang, S., Xiao, W., & Huang, F. (2021). E2E-VLP: End-to-end vision-language pre-training enhanced by visual learning. In Proceedings of the annual meeting of the association for computational linguistics.","DOI":"10.18653\/v1\/2021.acl-long.42"},{"key":"1949_CR73","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., & Bengio, Y. (2015). Show, attend and tell: Neural image caption generation with visual attention. In Proceedings of the international conference on machine learning."},{"key":"1949_CR74","unstructured":"Yan, M., Xu, H., Li, C., Bi, B., Tian, J., Gui, M., & Wang, W. (2021). Grid-VLP: Revisiting grid features for vision-language pretraining. arXiv preprint arXiv:2108.09479"},{"key":"1949_CR75","doi-asserted-by":"crossref","unstructured":"Yang, X., Tang, K., Zhang, H., & Cai, J. (2019). Auto-encoding scene graphs for image captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2019.01094"},{"key":"1949_CR76","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., & Bengio, Y. (2020). Large batch optimization for deep learning: Training BERT in 76 minutes. In Proceedings of the international conference on learning representations."},{"key":"1949_CR77","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., & Hockenmaier, J. (2014). From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics, 2, 67\u201378.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"1949_CR78","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., & Wu, Y. (2022). CoCa: Contrastive captioners are image- text foundation models. arXiv preprint arXiv:2205.01917"},{"key":"1949_CR79","doi-asserted-by":"crossref","unstructured":"Zhang, P., Li, X., Hu, X., Yang, J., Zhang, L., Wang, L., Choi, Y., & Gao, J. (2021). VinVL: Revisiting visual representations in vision-language models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"1949_CR80","unstructured":"Zhang, S., Roller, S., Goyal, N., Artetxe, M., Chen, M., Chen, S., Dewan, C., Diab, M., Li, X., Lin, X. V., & Mihaylov, T. (2022). OPT: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068"},{"key":"1949_CR81","doi-asserted-by":"crossref","unstructured":"Zhao, W., Wu, X., & Zhang, X. (2020). Mem-Cap: Memorizing style knowledge for image captioning. In Proceedings of the AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v34i07.6998"},{"key":"1949_CR82","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J. J., & Gao, J. (2020). Unified vision-language pre-training for image captioning and VQA. In Proceedings of the AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v34i07.7005"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-023-01949-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-023-01949-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-023-01949-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,5,7]],"date-time":"2024-05-07T08:13:35Z","timestamp":1715069615000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-023-01949-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,5]]},"references-count":82,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2024,5]]}},"alternative-id":["1949"],"URL":"https:\/\/doi.org\/10.1007\/s11263-023-01949-w","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12,5]]},"assertion":[{"value":"3 April 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 October 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 December 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}