{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T02:37:20Z","timestamp":1774579040352,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2024"],"award-info":[{"award-number":["U21B2024"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Postdoctoral Fellowship Program of CPSF","award":["GZC20231918"],"award-info":[{"award-number":["GZC20231918"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681497","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"9369-9377","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Cross-Modal Coherence-Enhanced Feedback Prompting for News Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7526-4356","authenticated-orcid":false,"given":"Ning","family":"Xu","sequence":"first","affiliation":[{"name":"School of Electrical and Information Engineering, Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0436-5930","authenticated-orcid":false,"given":"Yifei","family":"Gao","sequence":"additional","affiliation":[{"name":"School of Electrical and Information Engineering, Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7923-2693","authenticated-orcid":false,"given":"Ting-Ting","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Electrical and Information Engineering, Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7635-0961","authenticated-orcid":false,"given":"Hongshuo","family":"Tian","sequence":"additional","affiliation":[{"name":"School of Electrical and Information Engineering, Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5755-9145","authenticated-orcid":false,"given":"An-An","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Electrical and Information Engineering, Tianjin University, Tianjin, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In ACLWorkshop on MT. 65--72.","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In ACLWorkshop on MT. 65--72."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Ali Furkan Biten Llu\u00eds G\u00f3mez Mar\u00e7al Rusi\u00f1ol and Dimosthenis Karatzas. 2019. Good News Everyone! Context Driven Entity-Aware Captioning for News Images. In CVPR. 12466--12475.","DOI":"10.1109\/CVPR.2019.01275"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Nenglun Chen Xingjia Pan Runnan Chen Lei Yang Zhiwen Lin Yuqiang Ren Haolei Yuan Xiaowei Guo Feiyue Huang and Wenping Wang. 2021. Distributed Attention for Grounded Image Captioning. In ACM MM. 1966--1975.","DOI":"10.1145\/3474085.3475354"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Sanjoy Chowdhury Sayan Nag and Dinesh Manocha. 2023. APoLLo: Unified Adapter and Prompt Learning for Vision Language Models. In EMNLP. 10173--10187.","DOI":"10.18653\/v1\/2023.emnlp-main.629"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Junjie Fei Teng Wang Jinrui Zhang Zhenyu He Chengjie Wang and Feng Zheng. 2023. Transferable Decoding with Visual Entities for Zero-Shot Image Captioning. In ICCV. 3113--3123.","DOI":"10.1109\/ICCV51070.2023.00291"},{"key":"e_1_3_2_2_6_1","volume-title":"ACL Workshop.","author":"Flick Carlos","year":"2018","unstructured":"Carlos Flick. 2018. Rouge: A package for automatic evaluation of summaries. In ACL Workshop."},{"key":"e_1_3_2_2_7_1","volume-title":"Multi-graph Fusion for Dynamic Graph Convolutional Network","author":"Gan Jiangzhang","year":"2022","unstructured":"Jiangzhang Gan, Rongyao Hu, Yujie Mo, Zhao Kang, Liang Peng, Yonghua Zhu, and Xiaofeng Zhu. 2022. Multi-graph Fusion for Dynamic Graph Convolutional Network. IEEE Transactions on Neural Networks and Learning Systems (2022), 10.1109\/TNNLS.2022.3172588."},{"key":"e_1_3_2_2_8_1","volume-title":"LAMM: Label Alignment for Multi-Modal Prompt Learning. In AAAI, Michael J. Wooldridge, Jennifer G. Dy, and Sriraam Natarajan (Eds.).","author":"Gao Jingsheng","year":"2024","unstructured":"Jingsheng Gao, Jiacheng Ruan, Suncheng Xiang, Zefang Yu, Ke Ji, Mingye Xie, Ting Liu, and Yuzhuo Fu. 2024. LAMM: Label Alignment for Multi-Modal Prompt Learning. In AAAI, Michael J. Wooldridge, Jennifer G. Dy, and Sriraam Natarajan (Eds.)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Mehrdad Hosseinzadeh and Yang Wang. 2021. Image Change Captioning by Learning From an Auxiliary Task. In CVPR. 2725--2734.","DOI":"10.1109\/CVPR46437.2021.00275"},{"key":"e_1_3_2_2_10_1","volume-title":"ICECAP: Information Concentrated Entity-aware Image Captioning. In ACMMM. 4217--4225.","author":"Hu Anwen","year":"2020","unstructured":"Anwen Hu, Shizhe Chen, and Qin Jin. 2020. ICECAP: Information Concentrated Entity-aware Image Captioning. In ACMMM. 4217--4225."},{"key":"e_1_3_2_2_11_1","unstructured":"Edward J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In ICLR."},{"key":"e_1_3_2_2_12_1","volume-title":"Yu","author":"Hu Xuming","year":"2023","unstructured":"Xuming Hu, Junzhe Chen, Aiwei Liu, Shiao Meng, Lijie Wen, and Philip S. Yu. 2023. Prompt Me Up: Unleashing the Power of Alignments for Multimodal Entity and Relation Extraction. In ACM MM. 5185--5194."},{"key":"e_1_3_2_2_13_1","volume-title":"Muhammad Maaz, Salman H. Khan, and Fahad Shahbaz Khan.","author":"Khattak Muhammad Uzair","year":"2023","unstructured":"Muhammad Uzair Khattak, Hanoona Abdul Rasheed, Muhammad Maaz, Salman H. Khan, and Fahad Shahbaz Khan. 2023. MaPLe: Multi-modal Prompt Learning. In CVPR. 19113--19122."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Simon Kornblith Lala Li Zirui Wang and Thao Nguyen. 2023. Guiding image captioning models toward more specific captions. In ICCV. 15213--15223.","DOI":"10.1109\/ICCV51070.2023.01400"},{"key":"e_1_3_2_2_15_1","volume-title":"HAAV: Hierarchical Aggregation of Augmented Views for Image Captioning. In CVPR. 11039--11049.","author":"Kuo Chia-Wen","year":"2023","unstructured":"Chia-Wen Kuo and Zsolt Kira. 2023. HAAV: Hierarchical Aggregation of Augmented Views for Image Captioning. In CVPR. 11039--11049."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","unstructured":"Guodun Li Yuchen Zhai Zehao Lin and Yin Zhang. 2021. Similar Scenes Arouse Similar Emotions: Parallel Data Augmentation for Stylized Image Captioning. In ACMMM. 5363--5372.","DOI":"10.1145\/3474085.3475662"},{"key":"e_1_3_2_2_17_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning. In NeurIPS."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"crossref","unstructured":"Yunpeng Luo Jiayi Ji Xiaoshuai Sun Liujuan Cao Yongjian Wu Feiyue Huang Chia-Wen Lin and Rongrong Ji. 2021. Dual-level Collaborative Transformer for Image Captioning. In AAAI. 2286--2293.","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3268069"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In USA. 311--318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_2_21_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, and Gregory Chanan. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In NeurIPS. 8024--8035."},{"key":"e_1_3_2_2_22_1","volume-title":"Heng Tao Shen, and Xiaofeng Zhu. [n. d.]. GRLC: Graph Representation Learning with Constraints","author":"Peng Liang","year":"2022","unstructured":"Liang Peng, Yujie Mo, Jie Xu, Jialie Shen, Xiaoshuang Shi, Xiaoxiao Li, Heng Tao Shen, and Xiaofeng Zhu. [n. d.]. GRLC: Graph Representation Learning with Constraints. IEEE transactions on neural networks and learning systems ([n. d.]), 10.1109\/TNNLS.2022.3230979."},{"key":"e_1_3_2_2_23_1","volume-title":"Visually-Aware Context Modeling for News Image Captioning. CoRR abs\/2308.08325","author":"Qu Tingyu","year":"2023","unstructured":"Tingyu Qu, Tinne Tuytelaars, and Marie-Francine Moens. 2023. Visually-Aware Context Modeling for News Image Captioning. CoRR abs\/2308.08325 (2023)."},{"key":"e_1_3_2_2_24_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML, Marina Meila and Tong Zhang (Eds.), Vol. 139. 8748--8763."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2721945"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"crossref","unstructured":"Jeff Rasley Samyam Rajbhandari Olatunji Ruwase and Yuxiong He. 2020. Deep-Speed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. In ACMSIGKDD. 3505--3506.","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_2_27_1","volume-title":"Prompting Large Language Models with Answer Heuristics for Knowledge-Based Visual Question Answering","author":"Shao Zhenwei","unstructured":"Zhenwei Shao, Zhou Yu, Meng Wang, and Jun Yu. 2023. Prompting Large Language Models with Answer Heuristics for Knowledge-Based Visual Question Answering. In CVPR. IEEE, 14974--14983."},{"key":"e_1_3_2_2_28_1","unstructured":"Kaitao Song Xu Tan Tao Qin Jianfeng Lu and Tie-Yan Liu. 2020. MPNet: Masked and Permuted Pre-training for Language Understanding. In NeurIPS."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Hongbo Sun Xiangteng He Jiahuan Zhou and Yuxin Peng. 2023. Fine-Grained Visual Prompt Learning of Vision-Language Models for Image Recognition. In ACM MM. 5828--5836.","DOI":"10.1145\/3581783.3612403"},{"key":"e_1_3_2_2_30_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. CoRR abs\/2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aur\u00e9lien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. CoRR abs\/2302.13971 (2023)."},{"key":"e_1_3_2_2_31_1","volume-title":"Alexander Patrick Mathews, and Lexing Xie","author":"Tran Alasdair","year":"2020","unstructured":"Alasdair Tran, Alexander Patrick Mathews, and Lexing Xie. 2020. Transform and Tell: Entity-Aware News Image Captioning. In CVPR. 13032--13042."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Ramakrishna Vedantam C. Lawrence Zitnick and Devi Parikh. 2015. CIDEr: Consensus-based image description evaluation. In CVPR. 4566--4575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"crossref","unstructured":"Bo Wang Zhao Zhang Suiyi Zhao Haijun Zhang Richang Hong and Meng Wang. 2023. CropCap: Embedding Visual Cross-Partition Dependency for Image Captioning. In ACM MM. 1750--1758.","DOI":"10.1145\/3581783.3612245"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Ting Wang Weidong Chen Yuanhe Tian Yan Song and Zhendong Mao. 2023. Improving Image Captioning via Predicting Structured Concepts. In EMNLP. 360--370.","DOI":"10.18653\/v1\/2023.emnlp-main.25"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"Yong Wang Wenkai Zhang Qing Liu Zhengyuan Zhang Xin Gao and Xian Sun. 2020. Improving Intra- and Inter-Modality Visual Relation for Image Captioning. In ACMMM. 4190--4198.","DOI":"10.1145\/3394171.3413877"},{"key":"e_1_3_2_2_36_1","unstructured":"Zhen Wang Rameswar Panda Leonid Karlinsky Rog\u00e9rio Feris Huan Sun and Yoon Kim. 2023. Multitask Prompt Tuning Enables Parameter-Efficient Transfer Learning. In ICLR. OpenReview.net."},{"key":"e_1_3_2_2_37_1","volume-title":"Mike Li, Simon Kornblith, Rebecca Roelofs, Raphael Gontijo Lopes, Hannaneh Hajishirzi, Ali Farhadi, Hongseok Namkoong, and Ludwig Schmidt.","author":"Wortsman Mitchell","year":"2022","unstructured":"Mitchell Wortsman, Gabriel Ilharco, Jong Wook Kim, Mike Li, Simon Kornblith, Rebecca Roelofs, Raphael Gontijo Lopes, Hannaneh Hajishirzi, Ali Farhadi, Hongseok Namkoong, and Ludwig Schmidt. 2022. Robust fine-tuning of zero-shot models. In CVPR."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"crossref","unstructured":"Xuewen Yang Svebor Karaman Joel R. Tetreault and Alejandro Jaimes. 2021. Journalistic Guidelines Aware News Image Captioning. In EMNLP. 5162--5175.","DOI":"10.18653\/v1\/2021.emnlp-main.419"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Jiarui Yu Haoran Li Yanbin Hao Bin Zhu Tong Xu and Xiangnan He. 2023. CgT-GAN: CLIP-guided Text GAN for Image Captioning. In ACMMM. 2252--2263.","DOI":"10.1145\/3581783.3611891"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2909864"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"Yajing Zhai Yawen Zeng Zhiyong Huang Zheng Qin Xin Jin and Da Cao. 2024. Multi-Prompts Learning with Cross-Modal Alignment for Attribute-Based Person Re-identification. In AAAI. 6979--6987.","DOI":"10.1609\/aaai.v38i7.28524"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"crossref","unstructured":"Jingjing Zhang Shancheng Fang Zhendong Mao Zhiwei Zhang and Yongdong Zhang. 2022. Fine-tuning with Multi-modal Entity Prompts for News Image Captioning. In ACMMM. 4365--4373.","DOI":"10.1145\/3503161.3547883"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Jing Zhang Yingshuai Xie and Xiaoqiang Liu. 2023. Improving Image Captioning through Visual and Semantic Mutual Promotion. In ACM MM. 4716--4724.","DOI":"10.1145\/3581783.3612480"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"crossref","unstructured":"Yi Zhang Ce Zhang Ke Yu Yushun Tang and Zhihai He. 2024. Concept-Guided Prompt Learning for Generalization in Vision-Language Models. In AAAI. 7377--7386.","DOI":"10.1609\/aaai.v38i7.28568"},{"key":"e_1_3_2_2_45_1","volume-title":"Chen Change Loy, and Ziwei Liu","author":"Zhou Kaiyang","year":"2022","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen Change Loy, and Ziwei Liu. 2022. Conditional Prompt Learning for Vision-Language Models. In CVPR."},{"key":"e_1_3_2_2_46_1","volume-title":"Chen Change Loy, and Ziwei Liu","author":"Zhou Kaiyang","year":"2022","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen Change Loy, and Ziwei Liu. 2022. Learning to Prompt for Vision-Language Models. Int. J. Comput. Vis. (2022)."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"crossref","unstructured":"Mingyang Zhou Grace Luo Anna Rohrbach and Zhou Yu. 2022. Focus! Relevant and Sufficient Context Selection for News Image Captioning. In EMNLP. 6078--6088.","DOI":"10.18653\/v1\/2022.findings-emnlp.450"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"crossref","unstructured":"Yuanen Zhou Yong Zhang Zhenzhen Hu and Meng Wang. 2021. Semi-Autoregressive Transformer for Image Captioning. In ICCVW. 3132--3136.","DOI":"10.1109\/ICCVW54120.2021.00350"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.07.013"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681497","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681497","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:48Z","timestamp":1750294668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681497"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":49,"alternative-id":["10.1145\/3664647.3681497","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681497","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}