{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T10:31:29Z","timestamp":1761388289077,"version":"build-2065373602"},"reference-count":35,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T00:00:00Z","timestamp":1755734400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T00:00:00Z","timestamp":1755734400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s00530-025-01947-7","type":"journal-article","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:47:29Z","timestamp":1755776849000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Aesblip2: generating image aesthetic caption via prompting"],"prefix":"10.1007","volume":"31","author":[{"given":"Guanjun","family":"Sheng","sequence":"first","affiliation":[]},{"given":"Yongzhen","family":"Ke","sequence":"additional","affiliation":[]},{"given":"Shuai","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Kai","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,21]]},"reference":[{"key":"1947_CR1","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: Neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057. PMLR (2015)"},{"key":"1947_CR2","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: A neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1947_CR3","unstructured":"Chang, K.-Y., Lu, K.-H., Chen, C.-S.: Aesthetic critiques generation for photos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3514\u20133523 (2017)"},{"key":"1947_CR4","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Lu, X., Zhang, J., Wang, J.Z.: Joint image and text representation for aesthetics analysis. In: Proceedings of the 24th ACM International Conference on Multimedia, pp. 262\u2013266 (2016)","DOI":"10.1145\/2964284.2967223"},{"key":"1947_CR5","doi-asserted-by":"crossref","unstructured":"Jin, X., Wu, L., Zhao, G., Li, X., Zhang, X., Ge, S., Zou, D., Zhou, B., Zhou, X.: Aesthetic attributes assessment of images. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 311\u2013319 (2019)","DOI":"10.1145\/3343031.3350970"},{"key":"1947_CR6","doi-asserted-by":"crossref","unstructured":"Ghosal, K., Rana, A., Smolic, A.: Aesthetic image captioning from weakly-labelled photographs. 
In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00556"},{"issue":"8","key":"1947_CR7","doi-asserted-by":"publisher","first-page":"749","DOI":"10.1049\/iet-cvi.2019.0361","volume":"13","author":"W Wang","year":"2019","unstructured":"Wang, W., Yang, S., Zhang, W., Zhang, J.: Neural aesthetic image reviewer. IET Comput. Vis. 13(8), 749\u2013758 (2019)","journal-title":"IET Comput. Vis."},{"key":"1947_CR8","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"1947_CR9","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"1947_CR10","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: contrastive captioners are image-text foundation models (2022). arXiv preprint arXiv:2205.01917"},{"key":"1947_CR11","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Zhou, F., Qiu, G.: Aesthetically relevant image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, pp. 3733\u20133741 (2023)","DOI":"10.1609\/aaai.v37i3.25485"},{"key":"1947_CR12","doi-asserted-by":"crossref","unstructured":"Ke, J., Ye, K., Yu, J., Wu, Y., Milanfar, P., Yang, F.: Vila: learning image aesthetics from user comments with vision-language pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10041\u201310051 (2023)","DOI":"10.1109\/CVPR52729.2023.00968"},{"key":"1947_CR13","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning (2021). arXiv preprint arXiv:2104.08691","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"issue":"9","key":"1947_CR14","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vis. 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vis."},{"key":"1947_CR15","doi-asserted-by":"crossref","unstructured":"Yeo, Y.-Y., See, J., Wong, L.-K., Goh, H.-N.: Generating aesthetic based critique for photographs. In: 2021 IEEE International Conference on Image Processing (ICIP), pp. 2523\u20132527. IEEE (2021)","DOI":"10.1109\/ICIP42928.2021.9506385"},{"key":"1947_CR16","doi-asserted-by":"crossref","unstructured":"Sharma, D., Dhiman, C., Kumar, D.: Control with style: style embedding-based variational autoencoder for controlled stylized caption generation framework. IEEE. Trans. Cogn. Dev. Syst. (2024)","DOI":"10.1109\/TCDS.2024.3405573"},{"issue":"2","key":"1947_CR17","doi-asserted-by":"publisher","first-page":"4219","DOI":"10.1007\/s11042-023-15291-3","volume":"83","author":"D Sharma","year":"2024","unstructured":"Sharma, D., Dhiman, C., Kumar, D.: Xgl-t transformer model for intelligent image captioning. Multimed. Tools Appl. 83(2), 4219\u20134240 (2024)","journal-title":"Multimed. 
Tools Appl."},{"key":"1947_CR18","doi-asserted-by":"crossref","unstructured":"Chen, J., Guo, H., Yi, K., Li, B., Elhoseiny, M.: Visualgpt: Data-efficient adaptation of pretrained language models for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18030\u201318040 (2022)","DOI":"10.1109\/CVPR52688.2022.01750"},{"key":"1947_CR19","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742 (2023). PMLR"},{"key":"1947_CR20","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: Enhancing vision-language understanding with advanced large language models (2023). arXiv preprint arXiv:2304.10592"},{"key":"1947_CR21","unstructured":"Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., Catanzaro, B.: Megatron-lm: training multi-billion parameter language models using model parallelism (2019). arXiv preprint arXiv:1909.08053"},{"issue":"9","key":"1947_CR22","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3560815","volume":"55","author":"P Liu","year":"2023","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., Neubig, G.: Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 55(9), 1\u201335 (2023)","journal-title":"ACM Comput. Surv."},{"key":"1947_CR23","doi-asserted-by":"crossref","unstructured":"Wang, N., Xie, J., Wu, J., Jia, M., Li, L.: Controllable image captioning via prompting. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, pp. 2617\u20132625 (2023)","DOI":"10.1609\/aaai.v37i2.25360"},{"key":"1947_CR24","doi-asserted-by":"crossref","unstructured":"Wang, J., Chan, K.C., Loy, C.C.: Exploring clip for assessing the look and feel of images. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, pp. 2555\u20132563 (2023)","DOI":"10.1609\/aaai.v37i2.25353"},{"key":"1947_CR25","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16816\u201316825 (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"1947_CR26","doi-asserted-by":"crossref","unstructured":"Murray, N., Marchesotti, L., Perronnin, F.: Ava: A large-scale database for aesthetic visual analysis. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 2408\u20132415 (2012). IEEE","DOI":"10.1109\/CVPR.2012.6247954"},{"key":"1947_CR27","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding (2018). arXiv preprint arXiv:1810.04805"},{"key":"1947_CR28","doi-asserted-by":"crossref","unstructured":"Liu, X., Zheng, Y., Du, Z., Ding, M., Qian, Y., Yang, Z., Tang, J.: Gpt understands, too. AI Open (2023)","DOI":"10.1016\/j.aiopen.2023.08.012"},{"key":"1947_CR29","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: Bleu: a method for automatic evaluation of machine translation. In: ProDceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 
311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1947_CR30","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1947_CR31","unstructured":"Lin, C.-Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"1947_CR32","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: Spice: semantic propositional image caption evaluation. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14, pp. 382\u2013398 (2016). Springer","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1947_CR33","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization (2017). arXiv preprint arXiv:1711.05101"},{"key":"1947_CR34","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., et al.: Laion-5b: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1947_CR35","unstructured":"Han, Z., Gao, C., Liu, J., Zhang, S.Q., et al.: Parameter-efficient fine-tuning for large models: a comprehensive survey (2024). arXiv preprint arXiv:2403.14608"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01947-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01947-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01947-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T10:25:57Z","timestamp":1761387957000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01947-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,21]]},"references-count":35,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["1947"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01947-7","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,8,21]]},"assertion":[{"value":"9 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of 
interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and informed consent for data used"}}],"article-number":"362"}}