{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:30:34Z","timestamp":1764588634314,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62201460, 62302093"],"award-info":[{"award-number":["62201460, 62302093"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Jiangsu Province Natural Science Fund","award":["BK20230833"],"award-info":[{"award-number":["BK20230833"]}]},{"name":"Basic Research Programs of Taicang","award":["TC2023JC22"],"award-info":[{"award-number":["TC2023JC22"]}]},{"DOI":"10.13039\/501100006374","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["D5000250044, D5000250060"],"award-info":[{"award-number":["D5000250044, D5000250060"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733389","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:04Z","timestamp":1750876264000},"page":"1331-1339","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["MirrorDiff: Learning Mirror Diffusion for Image Captioning via Regeneration"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-9955-7838","authenticated-orcid":false,"given":"Junbo","family":"Wang","sequence":"first","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6433-7528","authenticated-orcid":false,"given":"Liangyu","family":"Fu","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6003-7739","authenticated-orcid":false,"given":"Yining","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1781-1067","authenticated-orcid":false,"given":"Qiangguo","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9464-1778","authenticated-orcid":false,"given":"Hongsong","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Southeast University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9836-4600","authenticated-orcid":false,"given":"Yuke","family":"Li","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6244-0269","authenticated-orcid":false,"given":"Xuecheng","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6891-8059","authenticated-orcid":false,"given":"Kun","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Science, Edith Cowan University, Perth, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Spice: Semantic propositional image caption evaluation. In Computer Vision-ECCV 2016: 14th European Conference","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer, 382--398."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_1_4_1","volume-title":"Analog bits: Generating discrete data using diffusion models with self-conditioning. arXiv preprint arXiv:2208.04202","author":"Chen Ting","year":"2022","unstructured":"Ting Chen, Ruixiang Zhang, and Geoffrey Hinton. 2022. Analog bits: Generating discrete data using diffusion models with self-conditioning. arXiv preprint arXiv:2208.04202 (2022)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_1_6_1","volume-title":"Emu: Enhancing image generation models using photogenic needles in a haystack. arXiv preprint arXiv:2309.15807","author":"Dai Xiaoliang","year":"2023","unstructured":"Xiaoliang Dai, Ji Hou, Chih-Yao Ma, Sam Tsai, Jialiang Wang, Rui Wang, Peizhao Zhang, Simon Vandenhende, Xiaofang Wang, Abhimanyu Dubey, et al. 2023. Emu: Enhancing image generation models using photogenic needles in a haystack. arXiv preprint arXiv:2309.15807 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171--4186."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01748"},{"volume-title":"Fast image caption generation with position alignment. arXiv preprint arXiv:1912.06365","year":"2019","key":"e_1_3_2_1_10_1","unstructured":"Zheng-cong Fei. 2019. Fast image caption generation with position alignment. arXiv preprint arXiv:1912.06365 (2019)."},{"key":"e_1_3_2_1_11_1","volume-title":"Masked non-autoregressive image captioning. arXiv preprint arXiv:1906.00717","author":"Gao Junlong","year":"2019","unstructured":"Junlong Gao, Xi Meng, Shiqi Wang, Xia Li, Shanshe Wang, Siwei Ma, and Wen Gao. 2019. Masked non-autoregressive image captioning. arXiv preprint arXiv:1906.00717 (2019)."},{"key":"e_1_3_2_1_12_1","volume-title":"Non-autoregressive image captioning with counterfactuals-critical multi-agent learning. arXiv preprint arXiv:2005.04690","author":"Guo Longteng","year":"2020","unstructured":"Longteng Guo, Jing Liu, Xinxin Zhu, Xingjian He, Jie Jiang, and Hanqing Lu. 2020. Non-autoregressive image captioning with counterfactuals-critical multi-agent learning. arXiv preprint arXiv:2005.04690 (2020)."},{"key":"e_1_3_2_1_13_1","volume-title":"Diffcap: Exploring continuous diffusion on image captioning. arXiv preprint arXiv:2305.12144","author":"He Yufeng","year":"2023","unstructured":"Yufeng He, Zefan Cai, Xu Gan, and Baobao Chang. 2023. Diffcap: Exploring continuous diffusion on image captioning. arXiv preprint arXiv:2305.12144 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Image captioning: Transforming objects into words. Advances in neural information processing systems","author":"Herdade Simao","year":"2019","unstructured":"Simao Herdade, Armin Kappeler, Kofi Boakye, and Joao Soares. 2019. Image captioning: Transforming objects into words. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_15_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_1_16_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_18_1","volume-title":"Backpropagation applied to handwritten zip code recognition. Neural computation","author":"LeCun Yann","year":"1989","unstructured":"Yann LeCun, Bernhard Boser, John S Denker, Donnie Henderson, Richard E Howard, Wayne Hubbard, and Lawrence D Jackel. 1989. Backpropagation applied to handwritten zip code recognition. Neural computation, Vol. 1, 4 (1989), 541--551."},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_20_1","first-page":"4328","article-title":"Diffusion-lm improves controllable text generation","volume":"35","author":"Li Xiang","year":"2022","unstructured":"Xiang Li, John Thickstun, Ishaan Gulrajani, Percy S Liang, and Tatsunori B Hashimoto. 2022c. Diffusion-lm improves controllable text generation. Advances in Neural Information Processing Systems, Vol. 35 (2022), 4328--4343.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","volume-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision-ECCV 2020: 16th European Conference","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i10.17034"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01746"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01278"},{"key":"e_1_3_2_1_25_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_27_1","volume-title":"Prefix-diffusion: A lightweight diffusion model for diverse image captioning. arXiv preprint arXiv:2309.04965","author":"Liu Guisheng","year":"2023","unstructured":"Guisheng Liu, Yi Li, Zhengcong Fei, Haiyan Fu, Xiangyang Luo, and Yanqing Guo. 2023. Prefix-diffusion: A lightweight diffusion model for diverse image captioning. arXiv preprint arXiv:2309.04965 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02237"},{"key":"e_1_3_2_1_29_1","volume-title":"Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551581"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_33_1","volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_35_1","volume-title":"Learning representations by back-propagating errors. nature","author":"Rumelhart David E","year":"1986","unstructured":"David E Rumelhart, Geoffrey E Hinton, and Ronald J Williams. 1986. Learning representations by back-propagating errors. nature, Vol. 323, 6088 (1986), 533--536."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73016-0_6"},{"key":"e_1_3_2_1_37_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_1_38_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_41_1","volume-title":"Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_1_42_1","volume-title":"LaDiC: Are Diffusion Models Really Inferior to Autoregressive Counterparts for Image-to-Text Generation? arXiv preprint arXiv:2404.10763","author":"Wang Yuchi","year":"2024","unstructured":"Yuchi Wang, Shuhuai Ren, Rundong Gao, Linli Yao, Qingyan Guo, Kaikai An, Jianhong Bai, and Xu Sun. 2024. LaDiC: Are Diffusion Models Really Inferior to Autoregressive Counterparts for Image-to-Text Generation? arXiv preprint arXiv:2404.10763 (2024)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3036860"},{"key":"e_1_3_2_1_44_1","volume-title":"attend and tell: Neural image caption generation with visual attention. arXiv preprint arXiv:1502.03044","author":"Show Kelvin Xu.","year":"2015","unstructured":"Kelvin Xu. 2015. Show, attend and tell: Neural image caption generation with visual attention. arXiv preprint arXiv:1502.03044 (2015)."},{"key":"e_1_3_2_1_45_1","volume-title":"Clip-diffusion-lm: Apply diffusion model on image captioning. arXiv preprint arXiv:2210.04559","author":"Xu Shitong","year":"2022","unstructured":"Shitong Xu. 2022. Clip-diffusion-lm: Apply diffusion model on image captioning. arXiv preprint arXiv:2210.04559 (2022)."},{"key":"e_1_3_2_1_46_1","volume-title":"Image captioning in the transformer age. arXiv preprint arXiv:2204.07374","author":"Xu Yang","year":"2022","unstructured":"Yang Xu, Li Li, Haiyang Xu, Songfang Huang, Fei Huang, and Jianfei Cai. 2022. Image captioning in the transformer age. arXiv preprint arXiv:2204.07374 (2022)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00220"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01094"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.559"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.524"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"e_1_3_2_1_53_1","volume-title":"Seqdiffuseq: Text diffusion with encoder-decoder transformers. arXiv preprint arXiv:2212.10325","author":"Yuan Hongyi","year":"2022","unstructured":"Hongyi Yuan, Zheng Yuan, Chuanqi Tan, Fei Huang, and Songfang Huang. 2022. Seqdiffuseq: Text diffusion with encoder-decoder transformers. arXiv preprint arXiv:2212.10325 (2022)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_55_1","volume-title":"Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675","author":"Zhang Tianyi","year":"2019","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q Weinberger, and Yoav Artzi. 2019. Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675 (2019)."},{"key":"e_1_3_2_1_56_1","volume-title":"Exploring discrete diffusion models for image captioning. arXiv preprint arXiv:2211.11694","author":"Zhu Zixin","year":"2022","unstructured":"Zixin Zhu, Yixuan Wei, Jianfeng Wang, Zhe Gan, Zheng Zhang, Le Wang, Gang Hua, Lijuan Wang, Zicheng Liu, and Han Hu. 2022. Exploring discrete diffusion models for image captioning. arXiv preprint arXiv:2211.11694 (2022)."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733389","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:08:52Z","timestamp":1755749332000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733389"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":56,"alternative-id":["10.1145\/3731715.3733389","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733389","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}