{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T22:56:22Z","timestamp":1770418582052,"version":"3.49.0"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T00:00:00Z","timestamp":1770336000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T00:00:00Z","timestamp":1770336000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Front. Comput. Sci."],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1007\/s11704-025-50178-6","type":"journal-article","created":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T11:16:55Z","timestamp":1770376615000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Hacking reference-free image captioning metrics"],"prefix":"10.1007","volume":"20","author":[{"given":"Zheng","family":"Ma","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chang-Xin","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ya-Wen","family":"Ouyang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fei","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jian-Bing","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shu-Jian","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia-Jun","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,6]]},"reference":[{"key":"50178_CR1","first-page":"23318","volume-title":"Proceedings of the 39th International Conference on Machine Learning","author":"P Wang","year":"2022","unstructured":"Wang P, Yang A, Men R, Lin J, Bai S, Li Z, Ma J, Zhou C, Zhou J, Yang H. OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: Proceedings of the 39th International Conference on Machine Learning. 2022, 23318\u201323340"},{"key":"50178_CR2","doi-asserted-by":"publisher","first-page":"469","DOI":"10.1007\/978-3-031-44693-1_37","volume-title":"Proceedings of the 12th National CCF Conference on Natural Language Processing and Chinese Computing","author":"Z Ma","year":"2023","unstructured":"Ma Z, Wang C, Huang B, Zhu Z, Zhang J. Bounding and filling: a fast and flexible framework for image captioning. In: Proceedings of the 12th National CCF Conference on Natural Language Processing and Chinese Computing. 2023, 469\u2013481"},{"key":"50178_CR3","doi-asserted-by":"publisher","first-page":"5038","DOI":"10.1145\/3581783.3611987","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia","author":"K Cheng","year":"2023","unstructured":"Cheng K, Song W, Ma Z, Zhu W, Zhu Z, Zhang J. Beyond generic: enhancing image captioning with real-world knowledge using vision-language pre-training model. In: Proceedings of the 31st ACM International Conference on Multimedia. 2023, 5038\u20135047"},{"key":"50178_CR4","first-page":"6077","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"P Anderson","year":"2018","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L. Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2018, 6077\u20136086"},{"key":"50178_CR5","first-page":"5182","volume-title":"Proceedings of the 28th International Joint Conference on Artificial Intelligence","author":"B Shi","year":"2019","unstructured":"Shi B, Ji L, Lu P, Niu Z, Duan N. Knowledge aware semantic concept expansion for image-text matching. In: Proceedings of the 28th International Joint Conference on Artificial Intelligence. 2019, 5182\u20135189"},{"key":"50178_CR6","first-page":"4653","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"K Li","year":"2019","unstructured":"Li K, Zhang Y, Li K, Li Y, Fu Y. Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2019, 4653\u20134661"},{"key":"50178_CR7","unstructured":"Huang H, Qu Y, Liu J, Yang M, Zhao T. An empirical study of LLM-as-a-judge for LLM evaluation: Fine-tuned judge models are task-specific classifiers. 2024, arXiv preprint arXiv: 2403.02839"},{"key":"50178_CR8","first-page":"5723","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics","author":"J Ao","year":"2022","unstructured":"Ao J, Wang R, Zhou L, Wang C, Ren S, Wu Y, Liu S, Ko T, Li Q, Zhang Y, Wei Z, Qian Y, Li J, Wei F. SpeechT5: unified-modal encoderdecoder pre-training for spoken language processing. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics. 2022, 5723\u20135738"},{"key":"50178_CR9","first-page":"943","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"J Liu","year":"2023","unstructured":"Liu J, Xia C S, Wang Y, Zhang L. Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 943"},{"key":"50178_CR10","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"A Radford","year":"2021","unstructured":"Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I. Learning transferable visual models from natural language supervision. In: Proceedings of the 38th International Conference on Machine Learning. 2021, 8748\u20138763"},{"key":"50178_CR11","first-page":"2","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"J Lu","year":"2019","unstructured":"Lu J, Batra D, Parikh D, Lee S. ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems. 2019, 2"},{"key":"50178_CR12","first-page":"12888","volume-title":"Proceedings of the 39th International Conference on Machine Learning","author":"J Li","year":"2022","unstructured":"Li J, Li D, Xiong C, Hoi S. BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Proceedings of the 39th International Conference on Machine Learning. 2022, 12888\u201312900"},{"key":"50178_CR13","first-page":"1516","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"H Liu","year":"2023","unstructured":"Liu H, Li C, Wu Q, Lee Y J. Visual instruction tuning. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 1516"},{"key":"50178_CR14","unstructured":"Bai J, Bai S, Yang S, Wang S, Tan S, Wang P, Lin J, Zhou C, Zhou J. Qwen-VL: a frontier large vision-language model with versatile abilities. 2023, arXiv preprint arXiv: 2308.12966"},{"key":"50178_CR15","first-page":"370","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"L Chen","year":"2024","unstructured":"Chen L, Li J, Dong X, Zhang P, He C, Wang J, Zhao F, Lin D. ShareGPT4V: improving large multi-modal models with better captions. In: Proceedings of the 18th European Conference on Computer Vision. 2024, 370\u2013387"},{"key":"50178_CR16","first-page":"311","volume-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics","author":"K Papineni","year":"2002","unstructured":"Papineni K, Roukos S, Ward T, Zhu W J. Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics. 2002, 311\u2013318"},{"key":"50178_CR17","first-page":"4566","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"R Vedantam","year":"2015","unstructured":"Vedantam R, Zitnick C L, Parikh D. CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2015, 4566\u20134575"},{"key":"50178_CR18","first-page":"2141","volume-title":"Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing","author":"M Jiang","year":"2019","unstructured":"Jiang M, Huang Q, Zhang L, Wang X, Zhang P, Gan Z, Diesner J, Gao J. TIGEr: text-to-image grounding for image caption evaluation. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing. 2019, 2141\u20132152"},{"key":"50178_CR19","first-page":"220","volume-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing","author":"H Lee","year":"2021","unstructured":"Lee H, Yoon S, Dernoncourt F, Bui T, Jung K. UMIC: an unreferenced metric for image captioning via contrastive learning. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing. 2021, 220\u2013226"},{"key":"50178_CR20","doi-asserted-by":"publisher","first-page":"7514","DOI":"10.18653\/v1\/2021.emnlp-main.595","volume-title":"Proceedings of 2021 Conference on Empirical Methods in Natural Language Processing","author":"J Hessel","year":"2021","unstructured":"Hessel J, Holtzman A, Forbes M, Le Bras R, Choi Y. CLIPScore: a reference-free evaluation metric for image captioning. In: Proceedings of 2021 Conference on Empirical Methods in Natural Language Processing. 2021, 7514\u20137528"},{"issue":"1","key":"50178_CR21","doi-asserted-by":"publisher","first-page":"40","DOI":"10.1111\/medu.13124","volume":"51","author":"L Varpio","year":"2017","unstructured":"Varpio L, Ajjawi R, Monrouxe L V, O\u2019Brien B C, Rees C E. Shedding the cobra effect: problematising thematic emergence, triangulation, saturation and member checking. Medical Education, 2017, 51(1): 40\u201350.","journal-title":"Medical Education"},{"key":"50178_CR22","first-page":"65","volume-title":"Proceedings of the ACL workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization","author":"S Banerjee","year":"2005","unstructured":"Banerjee S, Lavie A. METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 2005, 65\u201372"},{"key":"50178_CR23","first-page":"74","volume-title":"Proceedings of Text Summarization Branches Out","author":"C Y Lin","year":"2004","unstructured":"Lin C Y. ROUGE: a package for automatic evaluation of summaries. In: Proceedings of Text Summarization Branches Out. 2004, 74\u201381"},{"key":"50178_CR24","first-page":"382","volume-title":"Proceedings of the 14th European Conference on Computer Vision","author":"P Anderson","year":"2016","unstructured":"Anderson P, Fernando B, Johnson M, Gould S. SPICE: semantic propositional image caption evaluation. In: Proceedings of the 14th European Conference on Computer Vision. 2016, 382\u2013398"},{"key":"50178_CR25","first-page":"3171","volume-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics","author":"A Hu","year":"2023","unstructured":"Hu A, Chen S, Zhang L, Jin Q. InfoMetIC: an informative metric for reference-free image caption evaluation. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics. 2023, 3171\u20133185"},{"key":"50178_CR26","volume-title":"Proceedings of the 11th International Conference on Learning Representations","author":"M Yuksekgonul","year":"2023","unstructured":"Yuksekgonul M, Bianchi F, Kalluri P, Jurafsky D, Zou J. When and why vision-language models behave like bags-of-words, and what to do about it?. In: Proceedings of the 11th International Conference on Learning Representations. 2023"},{"key":"50178_CR27","first-page":"5739","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"Z Ma","year":"2022","unstructured":"Ma Z, Zong S, Pan M, Zhang J, Huang S, Dai X, Chen J. Probing cross-modal semantics alignment capability from the textual perspective. In: Proceedings of Findings of the Association for Computational Linguistics. 2022, 5739\u20135749"},{"key":"50178_CR28","first-page":"517","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"J Cho","year":"2022","unstructured":"Cho J, Yoon S, Kale A, Dernoncourt F, Bui T, Bansal M. Finegrained image captioning with CLIP reward. In: Proceedings of Findings of the Association for Computational Linguistics. 2022, 517\u2013527"},{"key":"50178_CR29","first-page":"4171","volume-title":"Proceedings of 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"J Devlin","year":"2019","unstructured":"Devlin J, Chang M W, Lee K, Toutanova K. BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 2019, 4171\u20134186"},{"key":"50178_CR30","first-page":"1179","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"S J Rennie","year":"2017","unstructured":"Rennie S J, Marcheret E, Mroueh Y, Ross J, Goel V. Self-critical sequence training for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017, 1179\u20131195"},{"key":"50178_CR31","first-page":"104","volume-title":"Proceedings of 16th European Conference on Computer Vision","author":"Y C Chen","year":"2020","unstructured":"Chen Y C, Li L, Yu L, El Kholy A, Ahmed F, Gan Z, Cheng Y, Liu J. UNITER: UNiversal image-TExt representation learning. In: Proceedings of 16th European Conference on Computer Vision. 2020, 104\u2013120"},{"key":"50178_CR32","first-page":"1001","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Y Lu","year":"2023","unstructured":"Lu Y, Yang X, Li X, Wang X E, Wang W Y. LLMScore: unveiling the power of large language models in text-to-image synthesis evaluation. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 1001"},{"key":"50178_CR33","unstructured":"Chen X, Fang H, Lin T Y, Vedantam R, Gupta S, Dollar P, Zitnick C L. Microsoft COCO captions: data collection and evaluation server. 2015, arXiv preprint arXiv: 1504.00325"},{"key":"50178_CR34","first-page":"3128","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"A Karpathy","year":"2015","unstructured":"Karpathy A, Li F F. Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2015, 3128\u20133137"},{"key":"50178_CR35","first-page":"6000","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A N, Kaiser \u0141, Polosukhin I. Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems. 2017, 6000\u20136010"},{"issue":"1","key":"50178_CR36","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J, Chen S, Kalantidis Y, Li L J, Shamma D A, Bernstein M S, Li F F. Visual genome: connecting language and vision using crowdsourced dense image annotations. International Journal of Computer Vision, 2017, 123(1): 32\u201373.","journal-title":"International Journal of Computer Vision"},{"key":"50178_CR37","first-page":"4188","volume-title":"Proceedings of the 24th International Conference on Artificial Intelligence","author":"M Hodosh","year":"2015","unstructured":"Hodosh M, Young P, Hockenmaier J. Framing image description as a ranking task: data, models and evaluation metrics. In: Proceedings of the 24th International Conference on Artificial Intelligence. 2015, 4188\u20134192"}],"container-title":["Frontiers of Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-025-50178-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11704-025-50178-6","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-025-50178-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T11:17:03Z","timestamp":1770376623000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11704-025-50178-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,6]]},"references-count":37,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2026,8]]}},"alternative-id":["50178"],"URL":"https:\/\/doi.org\/10.1007\/s11704-025-50178-6","relation":{},"ISSN":["2095-2228","2095-2236"],"issn-type":[{"value":"2095-2228","type":"print"},{"value":"2095-2236","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,6]]},"assertion":[{"value":"23 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare that they have no competing interests or financial conflicts to disclose.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"2008343"}}