{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:01:03Z","timestamp":1765310463357,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["D5000250044, D5000250060"],"award-info":[{"award-number":["D5000250044, D5000250060"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017596","name":"Natural Science Basic Research Program of Shaanxi Province","doi-asserted-by":"publisher","award":["2025JC-YBQN-882, 2025JC-YBQN-805"],"award-info":[{"award-number":["2025JC-YBQN-882, 2025JC-YBQN-805"]}],"id":[{"id":"10.13039\/501100017596","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62302093"],"award-info":[{"award-number":["62302093"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Jiangsu Province Natural Science Fund","award":["BK20230833"],"award-info":[{"award-number":["BK20230833"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755156","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"3693-3701","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DSACap: Enhancing Visual-Semantic Alignment with Diffusion-based Framework for Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-6433-7528","authenticated-orcid":false,"given":"Liangyu","family":"Fu","sequence":"first","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9955-7838","authenticated-orcid":false,"given":"Junbo","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9836-4600","authenticated-orcid":false,"given":"Yuke","family":"Li","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1781-1067","authenticated-orcid":false,"given":"Qiangguo","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9464-1778","authenticated-orcid":false,"given":"Hongsong","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Engineering and Computer Science, Southeast University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4179-8210","authenticated-orcid":false,"given":"Jing","family":"Ya","sequence":"additional","affiliation":[{"name":"Beijing University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9701-6487","authenticated-orcid":false,"given":"Linjiang","family":"Huang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8637-0760","authenticated-orcid":false,"given":"Liang","family":"Yao","sequence":"additional","affiliation":[{"name":"Sun Yat-Sen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0623-584X","authenticated-orcid":false,"given":"Jiangbin","family":"Zheng","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6244-0269","authenticated-orcid":false,"given":"Xuecheng","family":"Wu","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8043-0312","authenticated-orcid":false,"given":"Zhiyong","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science, The University of Sydney, Sydney, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Spice: Semantic propositional image caption evaluation","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In ECCV. Springer, 382-398."},{"key":"e_1_3_2_2_2_1","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson Peter","year":"2018","unstructured":"Peter Anderson, Xiaodong He, Chris Buehler, Damien Teney, Mark Johnson, Stephen Gould, and Lei Zhang. 2018. Bottom-up and top-down attention for image captioning and visual question answering. In CVPR. 6077-6086.","journal-title":"CVPR."},{"key":"e_1_3_2_2_3_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72."},{"key":"e_1_3_2_2_4_1","volume-title":"Analog bits: Generating discrete data using diffusion models with self-conditioning. arXiv preprint arXiv:2208.04202","author":"Chen Ting","year":"2022","unstructured":"Ting Chen, Ruixiang Zhang, and Geoffrey Hinton. 2022. Analog bits: Generating discrete data using diffusion models with self-conditioning. arXiv preprint arXiv:2208.04202 (2022)."},{"key":"e_1_3_2_2_5_1","first-page":"10578","article-title":"Meshed-memory transformer for image captioning","author":"Cornia Marcella","year":"2020","unstructured":"Marcella Cornia, Matteo Stefanini, Lorenzo Baraldi, and Rita Cucchiara. 2020. Meshed-memory transformer for image captioning. In CVPR. 10578-10587.","journal-title":"CVPR."},{"key":"e_1_3_2_2_6_1","volume-title":"Emu: Enhancing image generation models using photogenic needles in a haystack. arXiv preprint arXiv:2309.15807","author":"Dai Xiaoliang","year":"2023","unstructured":"Xiaoliang Dai, Ji Hou, Chih-Yao Ma, Sam Tsai, Jialiang Wang, Rui Wang, Peizhao Zhang, Simon Vandenhende, Xiaofang Wang, Abhimanyu Dubey, et al., 2023. Emu: Enhancing image generation models using photogenic needles in a haystack. arXiv preprint arXiv:2309.15807 (2023)."},{"key":"e_1_3_2_2_7_1","first-page":"2625","article-title":"Long-term recurrent convolutional networks for visual recognition and description","author":"Donahue Jeffrey","year":"2015","unstructured":"Jeffrey Donahue, Lisa Anne Hendricks, Sergio Guadarrama, Marcus Rohrbach, Subhashini Venugopalan, Kate Saenko, and Trevor Darrell. 2015. Long-term recurrent convolutional networks for visual recognition and description. In CVPR. 2625-2634.","journal-title":"CVPR."},{"key":"e_1_3_2_2_8_1","first-page":"18009","article-title":"Injecting semantic concepts into end-to-end image captioning","author":"Fang Zhiyuan","year":"2022","unstructured":"Zhiyuan Fang, Jianfeng Wang, Xiaowei Hu, Lin Liang, Zhe Gan, Lijuan Wang, Yezhou Yang, and Zicheng Liu. 2022. Injecting semantic concepts into end-to-end image captioning. In CVPR. 18009-18019.","journal-title":"CVPR."},{"volume-title":"Fast image caption generation with position alignment. arXiv preprint arXiv:1912.06365","year":"2019","key":"e_1_3_2_2_9_1","unstructured":"Zheng-cong Fei. 2019. Fast image caption generation with position alignment. arXiv preprint arXiv:1912.06365 (2019)."},{"key":"e_1_3_2_2_10_1","volume-title":"Masked non-autoregressive image captioning. arXiv preprint arXiv:1906.00717","author":"Gao Junlong","year":"2019","unstructured":"Junlong Gao, Xi Meng, Shiqi Wang, Xia Li, Shanshe Wang, Siwei Ma, and Wen Gao. 2019. Masked non-autoregressive image captioning. arXiv preprint arXiv:1906.00717 (2019)."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_2_12_1","volume-title":"Non-autoregressive image captioning with counterfactuals-critical multi-agent learning. arXiv preprint arXiv:2005.04690","author":"Guo Longteng","year":"2020","unstructured":"Longteng Guo, Jing Liu, Xinxin Zhu, Xingjian He, Jie Jiang, and Hanqing Lu. 2020. Non-autoregressive image captioning with counterfactuals-critical multi-agent learning. arXiv preprint arXiv:2005.04690 (2020)."},{"key":"e_1_3_2_2_13_1","volume-title":"Diffcap: Exploring continuous diffusion on image captioning. arXiv preprint arXiv:2305.12144","author":"He Yufeng","year":"2023","unstructured":"Yufeng He, Zefan Cai, Xu Gan, and Baobao Chang. 2023. Diffcap: Exploring continuous diffusion on image captioning. arXiv preprint arXiv:2305.12144 (2023)."},{"key":"e_1_3_2_2_14_1","volume-title":"NeurIPS","volume":"32","author":"Herdade Simao","year":"2019","unstructured":"Simao Herdade, Armin Kappeler, Kofi Boakye, and Joao Soares. 2019. Image captioning: Transforming objects into words. NeurIPS, Vol. 32 (2019)."},{"key":"e_1_3_2_2_15_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_2_16_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. NeurIPS, Vol. 33 (2020), 6840-6851.","journal-title":"NeurIPS"},{"key":"e_1_3_2_2_17_1","first-page":"3128","article-title":"Deep visual-semantic alignments for generating image descriptions","author":"Karpathy Andrej","year":"2015","unstructured":"Andrej Karpathy and Li Fei-Fei. 2015. Deep visual-semantic alignments for generating image descriptions. In CVPR. 3128-3137.","journal-title":"CVPR."},{"key":"e_1_3_2_2_18_1","unstructured":"Diederik P Kingma Max Welling et al. 2013. Auto-encoding variational bayes."},{"key":"e_1_3_2_2_19_1","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888-12900.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_2_20_1","first-page":"4328","article-title":"Diffusion-lm improves controllable text generation","volume":"35","author":"Li Xiang","year":"2022","unstructured":"Xiang Li, John Thickstun, Ishaan Gulrajani, Percy S Liang, and Tatsunori B Hashimoto. 2022c. Diffusion-lm improves controllable text generation. NeurIPS, Vol. 35 (2022), 4328-4343.","journal-title":"NeurIPS"},{"key":"e_1_3_2_2_21_1","volume-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al., 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In ECCV. Springer, 121-137."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i10.17034"},{"key":"e_1_3_2_2_23_1","first-page":"17990","article-title":"Comprehending and ordering semantics for image captioning","author":"Li Yehao","year":"2022","unstructured":"Yehao Li, Yingwei Pan, Ting Yao, and Tao Mei. 2022b. Comprehending and ordering semantics for image captioning. In CVPR. 17990-17999.","journal-title":"CVPR."},{"key":"e_1_3_2_2_24_1","first-page":"12497","article-title":"Pointing novel objects in image captioning","author":"Li Yehao","year":"2019","unstructured":"Yehao Li, Ting Yao, Yingwei Pan, Hongyang Chao, and Tao Mei. 2019. Pointing novel objects in image captioning. In CVPR. 12497-12506.","journal-title":"CVPR."},{"key":"e_1_3_2_2_25_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81."},{"volume-title":"Microsoft coco: Common objects in context","author":"Lin Tsung-Yi","key":"e_1_3_2_2_26_1","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In ECCV. Springer, 740-755."},{"key":"e_1_3_2_2_27_1","volume-title":"O2NA: An object-oriented non-autoregressive approach for controllable video captioning. arXiv preprint arXiv:2108.02359","author":"Liu Fenglin","year":"2021","unstructured":"Fenglin Liu, Xuancheng Ren, Xian Wu, Bang Yang, Shen Ge, Yuexian Zou, and Xu Sun. 2021. O2NA: An object-oriented non-autoregressive approach for controllable video captioning. arXiv preprint arXiv:2108.02359 (2021)."},{"key":"e_1_3_2_2_28_1","volume-title":"Prefix-diffusion: A lightweight diffusion model for diverse image captioning. arXiv preprint arXiv:2309.04965","author":"Liu Guisheng","year":"2023","unstructured":"Guisheng Liu, Yi Li, Zhengcong Fei, Haiyan Fu, Xiangyang Luo, and Yanqing Guo. 2023. Prefix-diffusion: A lightweight diffusion model for diverse image captioning. arXiv preprint arXiv:2309.04965 (2023)."},{"key":"e_1_3_2_2_29_1","first-page":"23359","article-title":"Semantic-conditional diffusion networks for image captioning","author":"Luo Jianjie","year":"2023","unstructured":"Jianjie Luo, Yehao Li, Yingwei Pan, Ting Yao, Jianlin Feng, Hongyang Chao, and Tao Mei. 2023. Semantic-conditional diffusion networks for image captioning. In CVPR. 23359-23368.","journal-title":"CVPR."},{"key":"e_1_3_2_2_30_1","volume-title":"Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_2_31_1","first-page":"7070","article-title":"Auto-captions on GIF: A large-scale video-sentence dataset for vision-language pre-training","author":"Pan Yingwei","year":"2022","unstructured":"Yingwei Pan, Yehao Li, Jianjie Luo, Jun Xu, Ting Yao, and Tao Mei. 2022. Auto-captions on GIF: A large-scale video-sentence dataset for vision-language pre-training. In ACM MM. 7070-7074.","journal-title":"ACM MM."},{"key":"e_1_3_2_2_32_1","first-page":"10971","article-title":"X-linear attention networks for image captioning","author":"Pan Yingwei","year":"2020","unstructured":"Yingwei Pan, Ting Yao, Yehao Li, and Tao Mei. 2020. X-linear attention networks for image captioning. In CVPR. 10971-10980.","journal-title":"CVPR."},{"key":"e_1_3_2_2_33_1","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In ACL. 311-318.","journal-title":"ACL."},{"key":"e_1_3_2_2_34_1","volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)."},{"key":"e_1_3_2_2_35_1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748-8763.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_2_36_1","first-page":"10684","article-title":"High-resolution image synthesis with latent diffusion models","author":"Rombach Robin","year":"2022","unstructured":"Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR. 10684-10695.","journal-title":"CVPR."},{"key":"e_1_3_2_2_37_1","volume-title":"Learning representations by back-propagating errors. nature","author":"Rumelhart David E","year":"1986","unstructured":"David E Rumelhart, Geoffrey E Hinton, and Ronald J Williams. 1986. Learning representations by back-propagating errors. nature, Vol. 323, 6088 (1986), 533-536."},{"volume-title":"Adversarial diffusion distillation","author":"Sauer Axel","key":"e_1_3_2_2_38_1","unstructured":"Axel Sauer, Dominik Lorenz, Andreas Blattmann, and Robin Rombach. 2025. Adversarial diffusion distillation. In ECCV. Springer, 87-103."},{"key":"e_1_3_2_2_39_1","volume-title":"Attention is all you need. NeurIPS","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. NeurIPS (2017)."},{"key":"e_1_3_2_2_40_1","first-page":"4566","article-title":"Cider: Consensus-based image description evaluation","author":"Vedantam Ramakrishna","year":"2015","unstructured":"Ramakrishna Vedantam, C Lawrence Zitnick, and Devi Parikh. 2015. Cider: Consensus-based image description evaluation. In CVPR. 4566-4575.","journal-title":"CVPR."},{"key":"e_1_3_2_2_41_1","first-page":"3156","article-title":"Show and tell: A neural image caption generator","author":"Vinyals Oriol","year":"2015","unstructured":"Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. 2015. Show and tell: A neural image caption generator. In CVPR. 3156-3164.","journal-title":"CVPR."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733389"},{"key":"e_1_3_2_2_43_1","volume-title":"Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_2_44_1","volume-title":"LaDiC: Are Diffusion Models Really Inferior to Autoregressive Counterparts for Image-to-Text Generation? arXiv preprint arXiv:2404.10763","author":"Wang Yuchi","year":"2024","unstructured":"Yuchi Wang, Shuhuai Ren, Rundong Gao, Linli Yao, Qingyan Guo, Kaikai An, Jianhong Bai, and Xu Sun. 2024. LaDiC: Are Diffusion Models Really Inferior to Autoregressive Counterparts for Image-to-Text Generation? arXiv preprint arXiv:2404.10763 (2024)."},{"key":"e_1_3_2_2_45_1","volume-title":"Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine learning","author":"Williams Ronald J","year":"1992","unstructured":"Ronald J Williams. 1992. Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine learning, Vol. 8 (1992), 229-256."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3036860"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00854"},{"key":"e_1_3_2_2_48_1","volume-title":"attend and tell: Neural image caption generation with visual attention. arXiv preprint arXiv:1502.03044","author":"Show Kelvin Xu.","year":"2015","unstructured":"Kelvin Xu. 2015. Show, attend and tell: Neural image caption generation with visual attention. arXiv preprint arXiv:1502.03044 (2015)."},{"key":"e_1_3_2_2_49_1","volume-title":"Clip-diffusion-lm: Apply diffusion model on image captioning. arXiv preprint arXiv:2210.04559","author":"Xu Shitong","year":"2022","unstructured":"Shitong Xu. 2022. Clip-diffusion-lm: Apply diffusion model on image captioning. arXiv preprint arXiv:2210.04559 (2022)."},{"key":"e_1_3_2_2_50_1","first-page":"2197","article-title":"Auto-parsing network for image captioning and visual question answering","author":"Yang Xu","year":"2021","unstructured":"Xu Yang, Chongyang Gao, Hanwang Zhang, and Jianfei Cai. 2021. Auto-parsing network for image captioning and visual question answering. In ICCV. 2197-2207.","journal-title":"ICCV."},{"key":"e_1_3_2_2_51_1","first-page":"10685","article-title":"Auto-encoding scene graphs for image captioning","author":"Yang Xu","year":"2019","unstructured":"Xu Yang, Kaihua Tang, Hanwang Zhang, and Jianfei Cai. 2019. Auto-encoding scene graphs for image captioning. In CVPR. 10685-10694.","journal-title":"CVPR."},{"key":"e_1_3_2_2_52_1","first-page":"6580","article-title":"Incorporating copying mechanism in image captioning for learning novel objects","author":"Yao Ting","year":"2017","unstructured":"Ting Yao, Yingwei Pan, Yehao Li, and Tao Mei. 2017a. Incorporating copying mechanism in image captioning for learning novel objects. In CVPR. 6580-6588.","journal-title":"CVPR."},{"key":"e_1_3_2_2_53_1","first-page":"684","article-title":"Exploring visual relationship for image captioning","author":"Yao Ting","year":"2018","unstructured":"Ting Yao, Yingwei Pan, Yehao Li, and Tao Mei. 2018. Exploring visual relationship for image captioning. In ECCV. 684-699.","journal-title":"ECCV."},{"key":"e_1_3_2_2_54_1","first-page":"4894","article-title":"Boosting image captioning with attributes","author":"Yao Ting","year":"2017","unstructured":"Ting Yao, Yingwei Pan, Yehao Li, Zhaofan Qiu, and Tao Mei. 2017b. Boosting image captioning with attributes. In ICCV. 4894-4902.","journal-title":"ICCV."},{"key":"e_1_3_2_2_55_1","first-page":"4651","article-title":"Image captioning with semantic attention","author":"You Quanzeng","year":"2016","unstructured":"Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. 2016. Image captioning with semantic attention. In CVPR. 4651-4659.","journal-title":"CVPR."},{"key":"e_1_3_2_2_56_1","volume-title":"Seqdiffuseq: Text diffusion with encoder-decoder transformers. arXiv preprint arXiv:2212.10325","author":"Yuan Hongyi","year":"2022","unstructured":"Hongyi Yuan, Zheng Yuan, Chuanqi Tan, Fei Huang, and Songfang Huang. 2022. Seqdiffuseq: Text diffusion with encoder-decoder transformers. arXiv preprint arXiv:2212.10325 (2022)."},{"key":"e_1_3_2_2_57_1","first-page":"5579","article-title":"Vinvl: Revisiting visual representations in vision-language models","author":"Zhang Pengchuan","year":"2021","unstructured":"Pengchuan Zhang, Xiujun Li, Xiaowei Hu, Jianwei Yang, Lei Zhang, Lijuan Wang, Yejin Choi, and Jianfeng Gao. 2021. Vinvl: Revisiting visual representations in vision-language models. In CVPR. 5579-5588.","journal-title":"CVPR."},{"key":"e_1_3_2_2_58_1","volume-title":"Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675","author":"Zhang Tianyi","year":"2019","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q Weinberger, and Yoav Artzi. 2019. Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675 (2019)."},{"key":"e_1_3_2_2_59_1","first-page":"3139","article-title":"Semi-autoregressive transformer for image captioning","author":"Zhou Yuanen","year":"2021","unstructured":"Yuanen Zhou, Yong Zhang, Zhenzhen Hu, and Meng Wang. 2021. Semi-autoregressive transformer for image captioning. In ICCV. 3139-3143.","journal-title":"ICCV."},{"key":"e_1_3_2_2_60_1","volume-title":"Exploring discrete diffusion models for image captioning. arXiv preprint arXiv:2211.11694","author":"Zhu Zixin","year":"2022","unstructured":"Zixin Zhu, Yixuan Wei, Jianfeng Wang, Zhe Gan, Zheng Zhang, Le Wang, Gang Hua, Lijuan Wang, Zicheng Liu, and Han Hu. 2022. Exploring discrete diffusion models for image captioning. arXiv preprint arXiv:2211.11694 (2022)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755156","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:59:00Z","timestamp":1765310340000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755156"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":60,"alternative-id":["10.1145\/3746027.3755156","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755156","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}