{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:59Z","timestamp":1750309559707,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,21]]},"DOI":"10.1145\/3708657.3708748","type":"proceedings-article","created":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T11:13:43Z","timestamp":1748517223000},"page":"555-563","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Controllable and Diverse Image Captioning via Keyword-Driven Diffusion Networks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-7774-5971","authenticated-orcid":false,"given":"Mingxuan","family":"Pan","sequence":"first","affiliation":[{"name":"Department of Electronic Engineering, Tsinghua University, Beijing, Beijing, China, and Beijing National Research Center for Information Science and Technology (BNRist), Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6629-7228","authenticated-orcid":false,"given":"Yali","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Electronic Engineering, Tsinghua University, Beijing, Beijing, China, and Beijing National Research Center for Information Science and Technology (BNRist), Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6355-2978","authenticated-orcid":false,"given":"Simin","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Electronic Engineering, China Academy of Engineering Physics, Mianyang, Sichuan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7809-1932","authenticated-orcid":false,"given":"Shengjin","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Electronic Engineering, Tsinghua University, Beijing, Beijing, China, and Beijing National Research Center for Information Science and Technology (BNRist), Beijing, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,5,29]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_3_1_4_2","first-page":"65","volume-title":"Proceedings of the ACL-2005 Workshop on Intrinsic and Extrinsic Evaluation Measures for MT and\/or Summarization","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee, Alon Lavie, et\u00a0al. 2005. An automatic metric for mt evaluation with improved correlation with human judgments. In Proceedings of the ACL-2005 Workshop on Intrinsic and Extrinsic Evaluation Measures for MT and\/or Summarization. 65\u201372."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01657"},{"key":"e_1_3_3_1_6_2","unstructured":"Ting Chen Ruixiang Zhang and Geoffrey Hinton. 2022. Analog bits: Generating discrete data using diffusion models with self-conditioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.04202 (2022)."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.323"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58601-0_42"},{"key":"e_1_3_3_1_10_2","unstructured":"Jacob Devlin Ming-Wei Chang Kenton Lee and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.04805 (2018)."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"e_1_3_3_1_12_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et\u00a0al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020)."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16219"},{"key":"e_1_3_3_1_14_2","unstructured":"Junlong Gao Xi Meng Shiqi Wang Xia Li Shanshe Wang Siwei Ma and Wen Gao. 2019. Masked non-autoregressive image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1906.00717 (2019)."},{"key":"e_1_3_3_1_15_2","unstructured":"Simao Herdade Armin Kappeler Kofi Boakye and Joao Soares. 2019. Image captioning: Transforming objects into words. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"MD\u00a0Zakir Hossain Ferdous Sohel Mohd\u00a0Fairuz Shiratuddin and Hamid Laga. 2019. A comprehensive survey of deep learning for image captioning. ACM Computing Surveys (CsUR) 51 6 (2019) 1\u201336.","DOI":"10.1145\/3295748"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3478561"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Annika Lindh Robert\u00a0J Ross and John\u00a0D Kelleher. 2020. Language-driven region pointer advancement for controllable image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2011.14901 (2020).","DOI":"10.18653\/v1\/2020.coling-main.174"},{"key":"e_1_3_3_1_22_2","unstructured":"Jiasen Lu Dhruv Batra Devi Parikh and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02237"},{"key":"e_1_3_3_1_24_2","unstructured":"Ruotian Luo and Gregory Shakhnarovich. 2020. Analysis of diversity-accuracy tradeoff in image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2002.11848 (2020)."},{"key":"e_1_3_3_1_25_2","unstructured":"Ron Mokady Amir Hertz and Amit\u00a0H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.09734 (2021)."},{"key":"e_1_3_3_1_26_2","unstructured":"Vikram Mullachery and Vishal Motwani. 2018. Image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1805.09137 (2018)."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and W\u201cBLEU Zhu. 2001. A method for automatic evaluation of machine translation\u201d. the Proceedings of ACL-2002 ACL Philadelphia PA July 2002 (2001).","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_3_1_28_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_29_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_1_30_2","volume-title":"Proceedings of Workshop on Text Summarization of ACL, Spain","volume":"5","author":"ROUGE Lin\u00a0CY","year":"2004","unstructured":"Lin\u00a0CY ROUGE. 2004. A package for automatic evaluation of summaries. In Proceedings of Workshop on Text Summarization of ACL, Spain, Vol.\u00a05."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_3_1_32_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00432"},{"key":"e_1_3_3_1_36_2","unstructured":"Teng Wang Jinrui Zhang Junjie Fei Yixiao Ge Hao Zheng Yunlong Tang Zhe Li Mingqi Gao Shanshan Zhao Ying Shan et\u00a0al. 2023. Caption anything: Interactive image description with diverse multimodal controls. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.02677 (2023)."},{"key":"e_1_3_3_1_37_2","first-page":"2048","volume-title":"International conference on machine learning","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning. PMLR, 2048\u20132057."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475179"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094715"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00350"},{"key":"e_1_3_3_1_42_2","unstructured":"Zixin Zhu Yixuan Wei Jianfeng Wang Zhe Gan Zheng Zhang Le Wang Gang Hua Lijuan Wang Zicheng Liu and Han Hu. 2022. Exploring discrete diffusion models for image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.11694 (2022)."}],"event":{"name":"ICCIP 2024: 2024 the 10th International Conference on Communication and Information Processing","acronym":"ICCIP 2024","location":"Lingshui Hainan China"},"container-title":["Proceedings of the 2024 10th International Conference on Communication and Information Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708657.3708748","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3708657.3708748","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:58Z","timestamp":1750295938000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708657.3708748"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,21]]},"references-count":41,"alternative-id":["10.1145\/3708657.3708748","10.1145\/3708657"],"URL":"https:\/\/doi.org\/10.1145\/3708657.3708748","relation":{},"subject":[],"published":{"date-parts":[[2024,11,21]]},"assertion":[{"value":"2025-05-29","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}