{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:06:31Z","timestamp":1760832391453,"version":"build-2065373602"},"publisher-location":"Singapore","reference-count":39,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819534586","type":"print"},{"value":"9789819534593","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3459-3_38","type":"book-chapter","created":{"date-parts":[[2025,10,18]],"date-time":"2025-10-18T02:22:40Z","timestamp":1760754160000},"page":"482-495","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Image Captioning via\u00a0Masked Conditional Diffusion"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-5369-6627","authenticated-orcid":false,"given":"Jiayi","family":"Zhou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8784-8148","authenticated-orcid":false,"given":"Chen","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5141-2457","authenticated-orcid":false,"given":"Huidong","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1716-3028","authenticated-orcid":false,"given":"Sayaka","family":"Kamei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3046-8689","authenticated-orcid":false,"given":"Shuai","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7130-2864","authenticated-orcid":false,"given":"Yasuhiko","family":"Morimoto","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,10,19]]},"reference":[{"key":"38_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1007\/978-3-319-46454-1_24","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Anderson","year":"2016","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 382\u2013398. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24"},{"key":"38_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"38_CR3","first-page":"17981","volume":"34","author":"J Austin","year":"2021","unstructured":"Austin, J., Johnson, D.D., Ho, J., Tarlow, D., Van Den Berg, R.: Structured denoising diffusion models in discrete state-spaces. Adv. Neural. Inf. Process. Syst. 34, 17981\u201317993 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"38_CR4","unstructured":"Banerjee, S., Lavie, A.: Meteor: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures For Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"38_CR5","unstructured":"Chen, T., ZHANG, R., Hinton, G.: Analog bits: generating discrete data using diffusion models with self-conditioning. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"38_CR6","doi-asserted-by":"crossref","unstructured":"Chen, T.H., Liao, Y.H., Chuang, C.Y., Hsu, W.T., Fu, J., Sun, M.: Show, adapt and tell: adversarial training of cross-domain image captioner. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 521\u2013530 (2017)","DOI":"10.1109\/ICCV.2017.64"},{"key":"38_CR7","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference On Computer Vision and Pattern Recognition, pp. 10578\u201310587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"38_CR8","doi-asserted-by":"crossref","unstructured":"Donahue, J., et al.: Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2625\u20132634 (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"38_CR9","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"38_CR10","doi-asserted-by":"crossref","unstructured":"Fei, Z.: Iterative back modification for faster image captioning. In: Proceedings of the 28th ACM International Conference on Multimedia pp. 3182\u20133190 (2020)","DOI":"10.1145\/3394171.3413901"},{"key":"38_CR11","doi-asserted-by":"crossref","unstructured":"Graves, A., Graves, A.: Long short-term memory. Supervised sequence labelling with recurrent neural networks, pp. 37\u201345 (2012)","DOI":"10.1007\/978-3-642-24797-2_4"},{"key":"38_CR12","doi-asserted-by":"crossref","unstructured":"Gu, S., et al.: Vector quantized diffusion model for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10696\u201310706 (2022)","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"38_CR13","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: CLIPScore: a reference-free evaluation metric for image captioning. In: EMNLP (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"38_CR14","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"38_CR15","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. In: NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications (2021)"},{"key":"38_CR16","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.Y.: Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4634\u20134643 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"38_CR17","unstructured":"Kornblith, S., Li, L., Wang, Z., Nguyen, T.: Classifier-free guidance makes image captioning models more descriptive. In: ICLR 2023 Workshop on Multimodal Representation Learning: Perks and Pitfalls (2023)"},{"key":"38_CR18","doi-asserted-by":"crossref","unstructured":"Li, J., Vo, D.M., Sugimoto, A., Nakayama, H.: Evcap: retrieval-augmented image captioning with external visual-name memory for open-world comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13733\u201313742 (2024)","DOI":"10.1109\/CVPR52733.2024.01303"},{"key":"38_CR19","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"38_CR20","unstructured":"Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out, pp. 74\u201381 (2004)"},{"key":"38_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"38_CR22","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural. Inf. Process. Syst. 36, 34892\u201334916 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"38_CR23","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"38_CR24","doi-asserted-by":"crossref","unstructured":"Luo, J., et al.: Semantic-conditional diffusion networks for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23359\u201323368 (2023)","DOI":"10.1109\/CVPR52729.2023.02237"},{"key":"38_CR25","unstructured":"Mao, J., Xu, W., Yang, Y., Wang, J., Yuille, A.L.: Explain images with multimodal recurrent neural networks. arXiv preprint arXiv:1410.1090 (2014)"},{"key":"38_CR26","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"38_CR27","unstructured":"Radford, A., , et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"38_CR28","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"38_CR29","unstructured":"Song, Y., Ermon, S.: Generative modeling by estimating gradients of the data distribution. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"38_CR30","unstructured":"Van Den\u00a0Oord, A., Vinyals, O., et\u00a0al.: Neural discrete representation learning. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"38_CR31","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"38_CR32","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"38_CR33","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision And Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"38_CR34","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: The caltech-ucsd birds-200-2011 dataset. Tech. Rep. CNS-TR-2011-001, California Institute of Technology (2011)"},{"key":"38_CR35","doi-asserted-by":"crossref","unstructured":"Yan, X., Fei, Z., Li, Z., Wang, S., Huang, Q., Tian, Q.: Semi-autoregressive image captioning. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 2708\u20132716 (2021)","DOI":"10.1145\/3474085.3475179"},{"key":"38_CR36","doi-asserted-by":"crossref","unstructured":"Yu, H., Liu, Y., Qi, B., Hu, Z., Liu, H.: End-to-end non-autoregressive image captioning. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10095338"},{"issue":"1","key":"38_CR37","first-page":"9","volume":"8","author":"X Zhang","year":"2019","unstructured":"Zhang, X., Li, C., Morimoto, Y.: A multi-factor approach for stock price prediction by using recurrent neural networks. Bull. Network., Comput., Syste. Softw. 8(1), 9\u201313 (2019)","journal-title":"Bull. Network., Comput., Syste. Softw."},{"key":"38_CR38","doi-asserted-by":"crossref","unstructured":"Zhang, X., et al.: Rstnet: Captioning with adaptive attention on visual and non-visual words. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15465\u201315474 (2021)","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"38_CR39","unstructured":"Zhu, Z., et al.: Exploring discrete diffusion models for image captioning. arXiv preprint arXiv:2211.11694 (2022)"}],"container-title":["Lecture Notes in Computer Science","Advanced Data Mining and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3459-3_38","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,18]],"date-time":"2025-10-18T02:22:54Z","timestamp":1760754174000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3459-3_38"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"ISBN":["9789819534586","9789819534593"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3459-3_38","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,19]]},"assertion":[{"value":"19 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ADMA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Advanced Data Mining and Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kyoto","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"adma2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/adma2025.github.io\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}