{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T18:41:39Z","timestamp":1780080099643,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Fundamental Research Funds for the Central Universities, SCUT","award":["D2230080"],"award-info":[{"award-number":["D2230080"]}]},{"name":"CAAI-Huawei MindSpore Open Fund and the Science and Technology Planning Project of Guangdong Province","award":["2020B0101100002"],"award-info":[{"award-number":["2020B0101100002"]}]},{"name":"Guangxi Natural Science Foundation Key Project","award":["No. 2024GXNSF"],"award-info":[{"award-number":["No. 2024GXNSF"]}]},{"name":"Guangxi Natural Science Foundation","award":["2022GXNSFAA035627"],"award-info":[{"award-number":["2022GXNSFAA035627"]}]},{"name":"Open Research Fund of Guangxi Key Laboratory of Multimedia Communications and Network Technology"},{"name":"Open Research Fund of Key Laboratory of Big Data and Intelligent Robot (SCUT), Ministry of Education"},{"name":"CCF-Zhipu AI Large Model Fund"},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276072, 62076100, and 62261003"],"award-info":[{"award-number":["62276072, 62076100, and 62261003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangxi Scientific and Technological Bases and Talents Special Projects","award":["guikeAD23026230 and guikeAD23026213"],"award-info":[{"award-number":["guikeAD23026230 and guikeAD23026213"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612571","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"4949-4957","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Zero-TextCap: Zero-shot Framework for Text-based Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2481-5669","authenticated-orcid":false,"given":"Dongsheng","family":"Xu","sequence":"first","affiliation":[{"name":"Guangxi University, Nanning, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5482-6753","authenticated-orcid":false,"given":"Wenye","family":"Zhao","sequence":"additional","affiliation":[{"name":"Guangxi University, Nanning, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1767-789X","authenticated-orcid":false,"given":"Yi","family":"Cai","sequence":"additional","affiliation":[{"name":"South China University of Technology &amp; MOE of China, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7691-347X","authenticated-orcid":false,"given":"Qingbao","family":"Huang","sequence":"additional","affiliation":[{"name":"Guangxi University &amp; Guangxi Key Laboratory of Multimedia Communications and Network Technology, Nanning, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Spice: Semantic propositional image caption evaluation. In Computer Vision--ECCV 2016: 14th European Conference","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer, 382--398."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_2_3_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219861"},{"key":"e_1_3_2_2_5_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01748"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00306"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548150"},{"key":"e_1_3_2_2_12_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 17980--17989","author":"Hu Xiaowei","year":"2022","unstructured":"Xiaowei Hu, Zhe Gan, Jianfeng Wang, Zhengyuan Yang, Zicheng Liu, Yumao Lu, and Lijuan Wang. 2022. Scaling up vision-language pre-training for image captioning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 17980--17989."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"e_1_3_2_2_14_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11196"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_17_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_2_18_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_2_19_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_20_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_2_22_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, Vol. 28 (2015)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11990"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531431"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_2_29_1","volume-title":"Pointer networks. Advances in neural information processing systems","author":"Vinyals Oriol","year":"2015","unstructured":"Oriol Vinyals, Meire Fortunato, and Navdeep Jaitly. 2015. Pointer networks. Advances in neural information processing systems, Vol. 28 (2015)."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413753"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00136"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3159811"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25360"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00432"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16389"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01245"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00864"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02247"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20243"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16476"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612571","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612571","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:59:05Z","timestamp":1755820745000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612571"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":42,"alternative-id":["10.1145\/3581783.3612571","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612571","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}