{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:13:29Z","timestamp":1772907209139,"version":"3.50.1"},"reference-count":83,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T00:00:00Z","timestamp":1754524800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T00:00:00Z","timestamp":1754524800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1007\/s11263-025-02535-y","type":"journal-article","created":{"date-parts":[[2025,8,6]],"date-time":"2025-08-06T23:03:39Z","timestamp":1754521419000},"page":"7647-7671","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Positive-Augmented Contrastive Learning for Vision-and-Language Evaluation and Training"],"prefix":"10.1007","volume":"133","author":[{"given":"Sara","family":"Sarto","sequence":"first","affiliation":[]},{"given":"Nicholas","family":"Moratelli","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9640-9385","authenticated-orcid":false,"given":"Marcella","family":"Cornia","sequence":"additional","affiliation":[]},{"given":"Lorenzo","family":"Baraldi","sequence":"additional","affiliation":[]},{"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,7]]},"reference":[{"key":"2535_CR1","unstructured":"Aditya, S., Yang, Y., Baral, C., Fermuller, C., Aloimonos, Y. (2015). 
From Images to Sentences through Scene Description Graphs using Commonsense Reasoning and Knowledge. arXiv preprint arXiv:1511.03292."},{"key":"2535_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal, H., Desai, K., Chen, X., Jain, R., Batra, D., Parikh, D., . . . Anderson, P. (2019). nocaps: novel object captioning at scale. Proceedings of the IEEE\/CVF International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2019.00904"},{"key":"2535_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S. (2016). SPICE: Semantic Propositional Image Caption Evaluation. Proceedings of the European Conference on Computer Vision.","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"2535_CR4","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L. (2018). Bottom-up and top-down attention for image captioning and visual question answering. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"2535_CR5","unstructured":"Banerjee, S., & Lavie, A. (2005). METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. Proceedings of the Annual Meeting of the Association for Computational Linguistics Workshops."},{"key":"2535_CR6","doi-asserted-by":"crossref","unstructured":"Barraco, M., Sarto, S., Cornia, M., Baraldi, L., Cucchiara, R. (2023). With a Little Help from your own Past: Prototypical Memory Networks for Image Captioning. Proceedings of the IEEE\/CVF International Conference on Computer Vision.","DOI":"10.1109\/ICCV51070.2023.00282"},{"key":"2535_CR7","doi-asserted-by":"crossref","unstructured":"Chan, D., Petryk, S., Gonzalez, J.E., Darrell, T., Canny, J. (2023). CLAIR: Evaluating image captions with large language models. 
Proceedings of the Conference on Empirical Methods in Natural Language Processing.","DOI":"10.18653\/v1\/2023.emnlp-main.841"},{"key":"2535_CR8","doi-asserted-by":"crossref","unstructured":"Chen, L., Li, J., Dong, X., Zhang, P., He, C., Wang, J., . . . Lin, D. (2024). ShareGPT4V: Improving Large Multi-Modal Models with Better Captions. Proceedings of the European Conference on Computer Vision.","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"2535_CR9","unstructured":"Chen, Q., Deng, C., Wu, Q. (2022). Learning Distinct and Representative Modes for Image Captioning. Advances in Neural Information Processing Systems."},{"key":"2535_CR10","doi-asserted-by":"crossref","unstructured":"Cho, J., Yoon, S., Kale, A., Dernoncourt, F., Bui, T., Bansal, M. (2022). Fine-grained Image Captioning with CLIP Reward. Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics.","DOI":"10.18653\/v1\/2022.findings-naacl.39"},{"key":"2535_CR11","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R. (2020). Meshed-Memory Transformer for Image Captioning. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"2535_CR12","doi-asserted-by":"crossref","unstructured":"Cui, Y., Yang, G., Veit, A., Huang, X., Belongie, S. (2018). Learning to Evaluate Image Captioning. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2018.00608"},{"key":"2535_CR13","unstructured":"Dai, W., Li, J., Li, D., Tiong, A.M.H., Zhao, J., Wang, W., . . . Hoi, S. (2023). InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv preprint arXiv:2305.06500."},{"key":"2535_CR14","doi-asserted-by":"crossref","unstructured":"Dess\u00ec, R., Bevilacqua, M., Gualdoni, E., Rakotonirina, N.C., Franzon, F., Baroni, M. (2023). 
Cross-domain image captioning with discriminative finetuning. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.00670"},{"key":"2535_CR15","unstructured":"Dong, H., Li, J., Wu, B., Wang, J., Zhang, Y., Guo, H. (2024). Benchmarking and Improving Detail Image Caption. arXiv preprint arXiv:2405.19092"},{"key":"2535_CR16","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., . . . Houlsby, N. (2021). An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. Proceedings of the International Conference on Learning Representations."},{"key":"2535_CR17","unstructured":"Dubey, A., Jauhri, A., Pandey, A., Kadian, A., Al-Dahle, A., Letman, A., . . . others (2024). The Llama 3 Herd of Models. arXiv:2407.21783"},{"key":"2535_CR18","doi-asserted-by":"crossref","unstructured":"Gurari, D., Zhao, Y., Zhang, M., Bhattacharya, N. (2020). Captioning Images Taken by People Who Are Blind. Proceedings of the European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-58520-4_25"},{"key":"2535_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J. (2016). Deep residual learning for image recognition. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2535_CR20","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y. (2021). CLIPScore: A Reference-free Evaluation Metric for Image Captioning. Proceedings of the Conference on Empirical Methods in Natural Language Processing.","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"2535_CR21","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., & Hockenmaier, J. (2013). Framing image description as a ranking task: Data, models and evaluation metrics. 
Journal of Artificial Intelligence Research, 47, 853\u2013899.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"2535_CR22","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., . . . Chen, W. (2021). LoRA: Low-Rank Adaptation of Large Language Models. arXiv preprint arXiv:2106.09685."},{"key":"2535_CR23","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.-Y. (2019). Attention on Attention for Image Captioning. Proceedings of the IEEE\/CVF International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"2535_CR24","doi-asserted-by":"crossref","unstructured":"Jiang, M., Huang, Q., Zhang, L., Wang, X., Zhang, P., Gan, Z., . . . Gao, J. (2019). TIGEr: Text-to-Image Grounding for Image Caption Evaluation. Proceedings of the Conference on Empirical Methods in Natural Language Processing.","DOI":"10.18653\/v1\/D19-1220"},{"key":"2535_CR25","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"2535_CR26","doi-asserted-by":"crossref","unstructured":"Khandelwal, A., Weihs, L., Mottaghi, R., Kembhavi, A. (2022). Simple but Effective: CLIP Embeddings for Embodied AI. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01441"},{"key":"2535_CR27","unstructured":"Kim, J.-H., Kim, Y., Lee, J., Yoo, K.M., Lee, S.-W. (2022). Mutual Information Divergence: A Unified Metric for Multimodal Generative Models. Advances in Neural Information Processing Systems."},{"key":"2535_CR28","unstructured":"Kingma, D.P., & Ba, J. (2015). Adam: A Method for Stochastic Optimization. 
Proceedings of the International Conference on Learning Representations."},{"key":"2535_CR29","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., . . . others (2023). Segment anything. Proceedings of the IEEE\/CVF International Conference on Computer Vision.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2535_CR30","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511815829","volume-title":"Statistical machine translation","author":"P Koehn","year":"2009","unstructured":"Koehn, P. (2009). Statistical machine translation. Cambridge University Press."},{"key":"2535_CR31","unstructured":"Lauren\u00e7on, H., Marafioti, A., Sanh, V., Tronchon, L. (2024). Building and better understanding vision-language models: insights and future directions. Neurips workshops."},{"key":"2535_CR32","unstructured":"Lauren\u00e7on, H., Saulnier, L., Tronchon, L., Bekman, S., Singh, A., et al. (2023). OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. Advances in Neural Information Processing Systems."},{"key":"2535_CR33","doi-asserted-by":"crossref","unstructured":"Lee, H., Yoon, S., Dernoncourt, F., Bui, T., Jung, K. (2021). UMIC: An Unreferenced Metric for Image Captioning via Contrastive Learning. Proceedings of the Annual Meeting of the Association for Computational Linguistics.","DOI":"10.18653\/v1\/2021.acl-short.29"},{"key":"2535_CR34","doi-asserted-by":"crossref","unstructured":"Lee, H., Yoon, S., Dernoncourt, F., Kim, D.S., Bui, T., Jung, K. (2020). ViLBERTScore: Evaluating Image Caption Using Vision-and-Language BERT. Proceedings of the Conference on Empirical Methods in Natural Language Processing Workshops.","DOI":"10.18653\/v1\/2020.eval4nlp-1.4"},{"key":"2535_CR35","doi-asserted-by":"crossref","unstructured":"Lee, Y., Park, I., Kang, M. (2024). FLEUR: An Explainable Reference-Free Evaluation Metric for Image Captioning Using a Large Multimodal Model. 
Proceedings of the Annual Meeting of the Association for Computational Linguistics.","DOI":"10.18653\/v1\/2024.acl-long.205"},{"key":"2535_CR36","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S. (2023a). BLIP-2: Bootstrapping Language-Image Pretraining with Frozen Image Encoders and Large Language Models. Proceedings of the International Conference on Machine Learning."},{"key":"2535_CR37","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S. (2023b). BLIP-2: Bootstrapping Language-Image Pretraining with Frozen Image Encoders and Large Language Models. Proceedings of the International Conference on Machine Learning."},{"key":"2535_CR38","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S. (2022). BLIP: Bootstrapping Language-Image Pretraining for Unified Vision-Language Understanding and Generation. Proceedings of the International Conference on Machine Learning."},{"key":"2535_CR39","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., Hu, X., Zhang, L., . . . others (2020). Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. Proceedings of the European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"2535_CR40","doi-asserted-by":"crossref","unstructured":"Li, Y., Pan, Y., Yao, T., Mei, T. (2022). Comprehending and Ordering Semantics for Image Captioning. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01746"},{"key":"2535_CR41","unstructured":"Lin, C.-Y. (2004). Rouge: A package for automatic evaluation of summaries. Proceedings of the Annual Meeting of the Association for Computational Linguistics Workshops."},{"key":"2535_CR42","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., . . . Zitnick, C.L. (2014). Microsoft COCO: Common Objects in Context. 
Proceedings of the European Conference on Computer Vision.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2535_CR43","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J. (2024). Improved Baselines with Visual Instruction Tuning. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"2535_CR44","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J. (2023). Visual Instruction Tuning. Advances in Neural Information Processing Systems."},{"key":"2535_CR45","unstructured":"Loshchilov, I., & Hutter, F. (2019). Decoupled Weight Decay Regularization. Proceedings of the International Conference on Learning Representations."},{"key":"2535_CR46","doi-asserted-by":"crossref","unstructured":"Luo, Y., Ji, J., Sun, X., Cao, L., Wu, Y., Huang, F., . . . Ji, R. (2021). Dual-level Collaborative Transformer for Image Captioning. Proceedings of the AAAI Conference on Artificial Intelligence.","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"2535_CR47","doi-asserted-by":"crossref","unstructured":"Materzy\u0144ska, J., Torralba, A., Bau, D. (2022). Disentangling Visual and Written Concepts in CLIP. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01592"},{"key":"2535_CR48","unstructured":"Mokady, R., Hertz, A., Bermano, A.H. (2021). ClipCap: CLIP Prefix for Image Captioning. arXiv preprint arXiv:2111.09734."},{"key":"2535_CR49","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O. (2018). Representation Learning with Contrastive Predictive Coding. arXiv preprint arXiv:1807.03748."},{"key":"2535_CR50","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T. (2020). X-Linear Attention Networks for Image Captioning. 
Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"2535_CR51","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J. (2002). BLEU: a method for automatic evaluation of machine translation. Proceedings of the Annual Meeting of the Association for Computational Linguistics.","DOI":"10.3115\/1073083.1073135"},{"key":"2535_CR52","doi-asserted-by":"crossref","unstructured":"Petryk, S., Chan, D.M., Kachinthaya, A., Zou, H., Canny, J., Gonzalez, J.E., Darrell, T. (2024). ALOHa: A New Measure for Hallucination in Captioning Models. Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics.","DOI":"10.18653\/v1\/2024.naacl-short.30"},{"key":"2535_CR53","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., & Sutskever, I. (2021). Learning Transferable Visual Models From Natural Language Supervision."},{"key":"2535_CR54","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M. (2022). Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv preprint arXiv:2204.06125."},{"key":"2535_CR55","doi-asserted-by":"crossref","unstructured":"Ramos, R., Martins, B., Elliott, D., Kementchedjhieva, Y. (2023). SmallCap: Lightweight Image Captioning Prompted With Retrieval Augmentation. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.00278"},{"key":"2535_CR56","unstructured":"Rashtchian, C., Young, P., Hodosh, M., Hockenmaier, J. (2010). Collecting Image Annotations Using Amazon\u2019s Mechanical Turk. Naacl workshops."},{"key":"2535_CR57","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V. (2017). Self-Critical Sequence Training for Image Captioning. 
Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2017.131"},{"key":"2535_CR58","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Hendricks, L.A., Burns, K., Darrell, T., Saenko, K. (2018). Object Hallucination in Image Captioning. Proceedings of the Conference on Empirical Methods in Natural Language Processing.","DOI":"10.18653\/v1\/D18-1437"},{"key":"2535_CR59","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B. (2022). High-Resolution Image Synthesis With Latent Diffusion Models. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2535_CR60","doi-asserted-by":"crossref","unstructured":"Rotstein, N., Bensa\u00efd, D., Brody, S., Ganz, R., Kimmel, R. (2024). FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions. Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision.","DOI":"10.1109\/WACV57701.2024.00559"},{"key":"2535_CR61","doi-asserted-by":"crossref","unstructured":"Sarto, S., Barraco, M., Cornia, M., Baraldi, L., Cucchiara, R. (2023). Positive-Augmented Contrastive Learning for Image and Video Captioning Evaluation. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.00668"},{"key":"2535_CR62","doi-asserted-by":"crossref","unstructured":"Sarto, S., Cornia, M., Baraldi, L., Cucchiara, R. (2024). BRIDGE: Bridging Gaps in Image Captioning Evaluation with Stronger Visual Cues. Proceedings of the European Conference on Computer Vision.","DOI":"10.1007\/978-3-031-73229-4_5"},{"key":"2535_CR63","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., . . . Jitsev, J. (2022). LAION-5B: An open large-scale dataset for training next generation image-text models. 
Advances in Neural Information Processing Systems."},{"key":"2535_CR64","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R. (2018). Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. Proceedings of the Annual Meeting of the Association for Computational Linguistics.","DOI":"10.18653\/v1\/P18-1238"},{"key":"2535_CR65","doi-asserted-by":"crossref","unstructured":"Shekhar, R., Pezzelle, S., Klimovich, Y., Herbelot, A., Nabi, M., Sangineto, E., Bernardi, R. (2017). FOIL it! Find One mismatch between Image and Language caption. Proceedings of the Annual Meeting of the Association for Computational Linguistics.","DOI":"10.18653\/v1\/P17-1024"},{"key":"2535_CR66","doi-asserted-by":"crossref","unstructured":"Shi, Y., Yang, X., Xu, H., Yuan, C., Li, B., Hu, W., Zha, Z.-J. (2022). EMScore: Evaluating Video Captioning via Coarse-Grained and Fine-Grained Embedding Matching. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01740"},{"issue":"1","key":"2535_CR67","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1109\/TPAMI.2022.3148210","volume":"45","author":"M Stefanini","year":"2022","unstructured":"Stefanini, M., Cornia, M., Baraldi, L., Cascianelli, S., Fiameni, G., & Cucchiara, R. (2022). From Show to Tell: A Survey on Deep Learning-based Image Captioning. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45(1), 539\u2013559.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2535_CR68","doi-asserted-by":"crossref","unstructured":"Sun, Z., Fang, Y., Wu, T., Zhang, P., Zang, Y., Kong, S., . . . Wang, J. (2024). Alpha-CLIP: A CLIP Model Focusing on Wherever You Want. 
Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52733.2024.01237"},{"key":"2535_CR69","unstructured":"Tschannen, M., Gritsenko, A., Wang, X., Naeem, M.F., Alabdulmohsin, I., Parthasarathy, N., . . . others (2025). SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features. arXiv preprint arXiv:2502.14786"},{"key":"2535_CR70","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., . . . Polosukhin, I. (2017). Attention is all you need. Advances in Neural Information Processing Systems."},{"key":"2535_CR71","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D. (2015). CIDEr: Consensus-based Image Description Evaluation. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"2535_CR72","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D. (2015). Show and tell: A neural image caption generator. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"2535_CR73","doi-asserted-by":"crossref","unstructured":"Wada, Y., Kaneda, K., Saito, D., Sugiura, K. (2024). Polos: Multimodal Metric Learning from Human Feedback for Image Captioning. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52733.2024.01287"},{"key":"2535_CR74","doi-asserted-by":"crossref","unstructured":"Wang, S., Yao, Z., Wang, R., Wu, Z., Chen, X. (2021). FAIEr: Fidelity and Adequacy Ensured Image Caption Evaluation. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR46437.2021.01383"},{"key":"2535_CR75","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.-F., Wang, W.Y. (2019). 
VaTeX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research. Proceedings of the IEEE\/CVF International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2019.00468"},{"key":"2535_CR76","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhutdinov, R., . . . Bengio, Y. (2015). Show, attend and tell: Neural image caption generation with visual attention. Proceedings of the International Conference on Machine Learning."},{"key":"2535_CR77","doi-asserted-by":"crossref","unstructured":"Yi, Y., Deng, H., Hu, J. (2020). Improving Image Captioning Evaluation by Considering Inter References Variance. Proceedings of the Annual Meeting of the Association for Computational Linguistics.","DOI":"10.18653\/v1\/2020.acl-main.93"},{"key":"2535_CR78","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., & Hockenmaier, J. (2014). From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics, 2, 67\u201378.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"2535_CR79","doi-asserted-by":"crossref","unstructured":"Zeng, Z., Sun, J., Zhang, H., Wen, T., Su, Y., Xie, Y., . . . Chen, B. (2024). HICEScore: A Hierarchical Metric for Image Captioning Evaluation. Proceedings of the ACM International Conference on Multimedia.","DOI":"10.1145\/3664647.3681358"},{"key":"2535_CR80","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., Beyer, L. (2023). Sigmoid Loss for Language Image Pre-Training. Proceedings of the IEEE\/CVF International Conference on Computer Vision.","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"2535_CR81","doi-asserted-by":"crossref","unstructured":"Zhang, P., Li, X., Hu, X., Yang, J., Zhang, L., Wang, L., . . . Gao, J. 
(2021). VinVL: Revisiting visual representations in vision-language models. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"2535_CR82","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y. (2020). BERTScore: Evaluating Text Generation with BERT. Proceedings of the International Conference on Learning Representations."},{"key":"2535_CR83","doi-asserted-by":"crossref","unstructured":"Zhou, L., Kalantidis, Y., Chen, X., Corso, J.J., Rohrbach, M. (2019). Grounded Video Description. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2019.00674"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02535-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02535-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02535-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T06:28:07Z","timestamp":1762928887000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02535-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,7]]},"references-count":83,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2025,11]]}},"alternative-id":["2535"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02535-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025
,8,7]]},"assertion":[{"value":"8 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}