{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T15:26:36Z","timestamp":1774365996966,"version":"3.50.1"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:00:00Z","timestamp":1740096000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:00:00Z","timestamp":1740096000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s11760-025-03871-9","type":"journal-article","created":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T19:29:39Z","timestamp":1740166179000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["CDZL: a controllable diversity zero-shot image caption model using large language models"],"prefix":"10.1007","volume":"19","author":[{"given":"Xin","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Weiwei","family":"Kong","sequence":"additional","affiliation":[]},{"given":"Zongyao","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Menghao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yiwen","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,21]]},"reference":[{"key":"3871_CR1","doi-asserted-by":"crossref","unstructured":"Kishore, P., Salim, R., Todd W., Wei-Jing, Z.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"3871_CR2","unstructured":"Satanjeev, B., Alon, L.: Meteor: an automatic metric for mt evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp. 65\u201372 (2005)"},{"key":"3871_CR3","doi-asserted-by":"crossref","unstructured":"Ramakrishna, V., Lawrence, C.Z., Devi, P.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"3871_CR4","unstructured":"Lin, C. Y.: Rouge: a package for automatic evaluation of summaries. In: Text summarization branches out (pp. 74\u201381) (2004)"},{"key":"3871_CR5","doi-asserted-by":"crossref","unstructured":"Peter, A., Basura, F., Mark, J., Stephen, G.: Spice: semantic propositional image caption evaluation. In: European conference on computer vision, pp. 382\u2013398. Springer (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"3871_CR6","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick. C.L.: Microsoft coco: common objects in context. In European conference on computer vision, pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"3871_CR7","unstructured":"Yoad, T., Yoav, S., Idan, S., Lior, W.: Zerocap: zero-shot image-to-text generation for visualsemantic arithmetic. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17918\u201317928, (2022)"},{"key":"3871_CR8","doi-asserted-by":"crossref","unstructured":"Zeng, Z., et al.: Conzic: Controllable zero-shot image captioning by sampling-based polishing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. (2023)","DOI":"10.1109\/CVPR52729.2023.02247"},{"key":"3871_CR9","unstructured":"Yildirim, I.: Bayesian inference: Metropolis-hastings sampling[J]. Dept. of Brain and Cognitive Sciences, Univ. of Rochester, Rochester, NY (2012)"},{"key":"3871_CR10","unstructured":"Pengchuan, Z., Xiujun, L., Xiaowei, H., Jianwei, Y., Lei, Z., Lijuan, W., Yejin, C., Jianfeng, G.: Vinvl: revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5579\u2013 5588 (2021)"},{"key":"3871_CR11","doi-asserted-by":"crossref","unstructured":"Fei, J., Wang, T., Zhang, J., et al.: Transferable Decoding with Visual Entities for Zero-Shot Image Captioning[J]. arXiv preprint arXiv:2307.16525 (2023)","DOI":"10.1109\/ICCV51070.2023.00291"},{"key":"3871_CR12","unstructured":"Shizhe, C., Qin, J., Peng, W., Qi ,Wu.: Say as you wish: Fine-grained control of image caption generation with abstract scene graphs. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 9962\u20139971 (2020)"},{"key":"3871_CR13","unstructured":"Chuang, G., Zhe, G., Xiaodong, H., Jianfeng, G., Li, D.: Stylenet: generating attractive visual captions with styles. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3137\u20133146 (2017)"},{"key":"3871_CR14","unstructured":"Aditya, D., Jyoti, Aneja, L.W., Alexander, G.S., David, F:. Fast, diverse and accurate image captioning guided by part-of-speech. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10695\u201310704, (2019)"},{"key":"3871_CR15","doi-asserted-by":"crossref","unstructured":"Zhao, W., Xinxiao, W., Zhang, X.: Memcap: Memorizing style knowledge for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 12984\u201312992 (2020)","DOI":"10.1609\/aaai.v34i07.6998"},{"key":"3871_CR16","doi-asserted-by":"crossref","unstructured":"Van-Quang, N., Masanori, S., Takayuki O.: Grit: faster and better image captioning transformer using dual visual features. In: European Conference on Computer Vision, pp. 167\u2013184. Springer (2022)","DOI":"10.1007\/978-3-031-20059-5_10"},{"key":"3871_CR17","unstructured":"Xiaowei, H., Zhe, G., Jianfeng, W., Zhengyuan, Y., Zicheng, L., Yumao, L., Lijuan, W.: Scaling up vision-language pre-training for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17980\u201317989 (2022)"},{"key":"3871_CR18","doi-asserted-by":"crossref","unstructured":"Fang, Z., Wang, J., Hu, X., Liang, L., Gan, Z., Wang, L., Yang, Y., Liu, Z.: Injecting semantic concepts into end-to-end image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18009\u201318019, (2022)","DOI":"10.1109\/CVPR52688.2022.01748"},{"key":"3871_CR19","unstructured":"Alec, R., Jong. W.K., Chris, H., Aditya, R., Gabriel, G., Sandhini, A., Girish, S., Amanda, A., Pamela, M., Jack, C. et al. Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"3871_CR20","unstructured":"Liwei, W., Alexander, S., Svetlana, L.: Diverse and accurate image description using a variational auto-encoder with an additive gaussian encoding space. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"3871_CR21","doi-asserted-by":"crossref","unstructured":"Ashwin, V., Michael. C., Ramprasaath, S., Qing, S., Stefan, L., David, C., Dhruv, B.: Diverse beam search for improved description of complexscenes. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.12340"},{"key":"3871_CR22","unstructured":"Longteng, G., Jing, L., Peng, Y., Jiangwei, L., Hanqing, L.: Mscap: multi-style image captioning with unpaired stylized text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4204\u2013 4213 (2019)"},{"issue":"9","key":"3871_CR23","first-page":"1","volume":"55","author":"P Liu","year":"2023","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., Neubig, G.: Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 55(9), 1\u201335 (2023)","journal-title":"ACM Comput. Surv."},{"key":"3871_CR24","unstructured":"Mokady, R., Hertz, A., Bermano, A.H.: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)"},{"key":"3871_CR25","unstructured":"Yixuan, S., Lan, T., Yahui L., Fangyu, L, Dani, Y., Wang, Y., Kong, L., Collier, N.: Language models can see: Plugging visual controls in text generation. arXiv preprint arXiv:2205.02655 (2022)"},{"key":"3871_CR26","unstructured":"Sheng, S., Liunian, H., Li, H.T., Mohit, B., Anna, R., Kai-Wei, C., Zhewei, Y., Kurt, K. How much can clip benefit vision-and-language tasks? arXiv preprint arXiv:2107.06383 (2021)"},{"key":"3871_CR27","doi-asserted-by":"crossref","unstructured":"Andrej, K., Li, F.F.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"3871_CR28","unstructured":"OpenAI. 2021. GPT-3.5 Architecture. ChatGPT (Version 1). Retrieved from https:\/\/openai.com"},{"key":"3871_CR29","unstructured":"Jack, H., Ari, H., Maxwell, F., Ronan, L.B., Yejin, C.: Clipscore: a reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)"},{"key":"3871_CR30","unstructured":"Yisheng, X., Lijun, W., Junliang, G., Juntao, L., Min, Z., Tao, Q., Tie-yan, L.: A survey on nonautoregressive generation for neural machine translation and beyond. arXiv preprint arXiv:2204.09269 (2022)"},{"key":"3871_CR31","unstructured":"Jyoti, A., Harsh, A., Dhruv, B., Alexander, S.: Sequential latent spaces for modeling the intention during diverse image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4261\u20134270 (2019)"},{"key":"3871_CR32","unstructured":"Stefano B., Andrea E., Fabrizio, S.: Sentiwordnet 3.0: an enhanced lexical resource for sentiment analysis and opinion mining. In: Proceedings of the 7th International Conference on Language Resources and Evaluation (LREC\u201910), (2010)"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-03871-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-025-03871-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-03871-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,2]],"date-time":"2025-04-02T01:07:08Z","timestamp":1743556028000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-025-03871-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,21]]},"references-count":32,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["3871"],"URL":"https:\/\/doi.org\/10.1007\/s11760-025-03871-9","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,21]]},"assertion":[{"value":"25 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 August 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 February 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}}],"article-number":"324"}}