{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T18:35:37Z","timestamp":1771698937865,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":75,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFB1406703"],"award-info":[{"award-number":["2020YFB1406703"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611891","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"2252-2263","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":27,"title":["CgT-GAN: CLIP-guided Text GAN for Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6327-6811","authenticated-orcid":false,"given":"Jiarui","family":"Yu","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3593-7169","authenticated-orcid":false,"given":"Haoran","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0695-1566","authenticated-orcid":false,"given":"Yanbin","family":"Hao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9213-2611","authenticated-orcid":false,"given":"Bin","family":"Zhu","sequence":"additional","affiliation":[{"name":"Singapore Management University, Bras Basah, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4246-5386","authenticated-orcid":false,"given":"Tong","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8472-7992","authenticated-orcid":false,"given":"Xiangnan","family":"He","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Spice: Semantic propositional image caption evaluation. In ECCV. 382--398.","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In ECCV. 382--398."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-up and top-down attention for image captioning and visual question answering. In CVPR. 6077--6086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_3_1","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. 
In ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. 65--72."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00512"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3060948"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.08.019"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Chen Chen Shuai Mu Wanpeng Xiao Zexiong Ye Liesi Wu and Qi Ju. 2019. Improving image captioning with conditional generative adversarial nets. In AAAI. 8142--8150.","DOI":"10.1609\/aaai.v33i01.33018142"},{"key":"e_1_3_2_1_8_1","volume-title":"Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In CVPR. 5659--5667.","author":"Chen Long","year":"2017","unstructured":"Long Chen, Hanwang Zhang, Jun Xiao, Liqiang Nie, Jian Shao, Wei Liu, and Tat-Seng Chua. 2017. Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In CVPR. 5659--5667."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Jaemin Cho Seunghyun Yoon Ajinkya Kale Franck Dernoncourt Trung Bui and Mohit Bansal. 2022. Fine-grained image captioning with clip reward. In Findings of NAACL. 517--527.","DOI":"10.18653\/v1\/2022.findings-naacl.39"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Bo Dai Sanja Fidler Raquel Urtasun and Dahua Lin. 2017. Towards diverse and natural image descriptions via a conditional gan. In ICCV. 2970--2979.","DOI":"10.1109\/ICCV.2017.323"},{"key":"e_1_3_2_1_11_1","volume-title":"Words: Transformers for Image Recognition at Scale. In ICLR.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In ICLR."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Yang Feng Lin Ma Wei Liu and Jiebo Luo. 2019. Unsupervised image captioning. In CVPR. 4125--4134.","DOI":"10.1109\/CVPR.2019.00425"},{"key":"e_1_3_2_1_13_1","volume-title":"UNISON: Unpaired Cross-Lingual Image Captioning. In AAAI. 10654--10662.","author":"Gao Jiahui","year":"2022","unstructured":"Jiahui Gao, Yi Zhou, LH Philip, Shafiq Joty, and Jiuxiang Gu. 2022. UNISON: Unpaired Cross-Lingual Image Captioning. In AAAI. 10654--10662."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Jiuxiang Gu Shafiq Joty Jianfei Cai and Gang Wang. 2018. Unpaired image captioning by language pivoting. In ECCV. 503--519.","DOI":"10.1007\/978-3-030-01246-5_31"},{"key":"e_1_3_2_1_16_1","unstructured":"Jiuxiang Gu Shafiq Joty Jianfei Cai Handong Zhao Xu Yang and Gang Wang. 2019. Unpaired image captioning via scene graph alignments. In ICCV. 10323--10332."},{"key":"e_1_3_2_1_17_1","volume-title":"I Can't Believe There's No Images! Learning Visual Tasks Using only Language Data. arXiv preprint arXiv:2211.09778","author":"Gu Sophia","year":"2022","unstructured":"Sophia Gu, Christopher Clark, and Aniruddha Kembhavi. 2022. I Can't Believe There's No Images! Learning Visual Tasks Using only Language Data. 
arXiv preprint arXiv:2211.09778 (2022)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Dan Guo Yang Wang Peipei Song and Meng Wang. 2021. Recurrent relational memory network for unsupervised image captioning. In IJCAI. 920--926.","DOI":"10.24963\/ijcai.2020\/128"},{"key":"e_1_3_2_1_19_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. CLIPScore: A Reference-free Evaluation Metric for Image Captioning. In EMNLP. 7514--7528."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Ukyo Honda Yoshitaka Ushiku Atsushi Hashimoto Taro Watanabe and Yuji Matsumoto. 2021. Removing Word-Level Spurious Alignment between Images and Pseudo-Captions in Unsupervised Image Captioning. In EACL. 3692--3702.","DOI":"10.18653\/v1\/2021.eacl-main.323"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Jonathan Huang Vivek Rathod Chen Sun Menglong Zhu Anoop Korattikara Alireza Fathi Ian Fischer Zbigniew Wojna Yang Song Sergio Guadarrama et al. 2017. Speed\/accuracy trade-offs for modern convolutional object detectors. In CVPR. 7310--7311.","DOI":"10.1109\/CVPR.2017.351"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Ajay Jain Ben Mildenhall Jonathan T Barron Pieter Abbeel and Ben Poole. 2022. Zero-shot text-guided object generation with dream fields. In CVPR. 867--876.","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"e_1_3_2_1_23_1","unstructured":"Chao Jia Yinfei Yang Ye Xia Yi-Ting Chen Zarana Parekh Hieu Pham Quoc Le Yun-Hsuan Sung Zhen Li and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In ICML. 4904--4916."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Chen Ju Tengda Han Kunhao Zheng Ya Zhang and Weidi Xie. 2022. Prompting Visual-Language Models for Efficient Video Understanding. In ECCV. 105--124.","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Andrej Karpathy and Li Fei-Fei. 2015. Deep visual-semantic alignments for generating image descriptions. In CVPR. 3128--3137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Iro Laina Christian Rupprecht and Nassir Navab. 2019. Towards unsupervised image captioning with shared multimodal embeddings. In ICCV. 7414--7424.","DOI":"10.1109\/ICCV.2019.00751"},{"key":"e_1_3_2_1_27_1","unstructured":"Wei Li Linchao Zhu Longyin Wen and Yi Yang. 2023. DeCap: Decoding CLIP Latents for Zero-Shot Captioning via Text-Only Training. In ICLR."},{"key":"e_1_3_2_1_28_1","unstructured":"Yehao Li Yingwei Pan Ting Yao and Tao Mei. 2022. Comprehending and ordering semantics for image captioning. In CVPR. 17990--17999."},{"key":"e_1_3_2_1_29_1","unstructured":"Victor Weixin Liang Yuhui Zhang Yongchan Kwon Serena Yeung and James Y Zou. 2022. Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning. In NeurIPS. 17612--17625."},{"key":"e_1_3_2_1_30_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In ACL. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In ACL. 
74--81."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In ECCV. 740--755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Fenglin Liu Meng Gao Tianhao Zhang and Yuexian Zou. 2019a. Exploring semantic relationships for image captioning without parallel data. In ICDM. 439--448.","DOI":"10.1109\/ICDM.2019.00054"},{"key":"e_1_3_2_1_33_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019b. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Zhenguang Liu Kedi Lyu Shuang Wu Haipeng Chen Yanbin Hao and Shouling Ji. 2021a. Aggregated multi-gans for controlled 3d human motion prediction. In AAAI. 2225--2232.","DOI":"10.1609\/aaai.v35i3.16321"},{"key":"e_1_3_2_1_35_1","unstructured":"Zhenguang Liu Pengxiang Su Shuang Wu Xuanjing Shen Haipeng Chen Yanbin Hao and Meng Wang. 2021b. Motion prediction using trajectory cues. In ICCV. 13299--13308."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3139918"},{"key":"e_1_3_2_1_37_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. Decoupled Weight Decay Regularization. In ICLR."},{"key":"e_1_3_2_1_38_1","unstructured":"Jiasen Lu Caiming Xiong Devi Parikh and Richard Socher. 2017. Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In CVPR. 375--383."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Zihang Meng David Yang Xuefei Cao Ashish Shah and Ser-Nam Lim. 2022. Object-Centric Unsupervised Image Captioning. In ECCV. 219--235.","DOI":"10.1007\/978-3-031-20059-5_13"},{"key":"e_1_3_2_1_40_1","volume-title":"Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_1_41_1","unstructured":"Medhini Narasimhan Anna Rohrbach and Trevor Darrell. 2021. CLIP-It! language-guided video summarization. In NeurIPS. 13988--14000."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"David Nukrai Ron Mokady and Amir Globerson. 2022. Text-Only Training for Image Captioning using Noise-Injected CLIP. In EMNLP findings. 4055--4063.","DOI":"10.18653\/v1\/2022.findings-emnlp.299"},{"key":"e_1_3_2_1_43_1","unstructured":"Yingwei Pan Ting Yao Yehao Li and Tao Mei. 2020. X-linear attention networks for image captioning. In CVPR. 10971--10980."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In ACL. 311--318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_45_1","volume-title":"Styleclip: Text-driven manipulation of stylegan imagery. In ICCV. 2085--2094.","author":"Patashnik Or","year":"2021","unstructured":"Or Patashnik, Zongze Wu, Eli Shechtman, Daniel Cohen-Or, and Dani Lischinski. 2021. 
Styleclip: Text-driven manipulation of stylegan imagery. In ICCV. 2085--2094."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Bryan A Plummer Liwei Wang Chris M Cervantes Juan C Caicedo Julia Hockenmaier and Svetlana Lazebnik. 2015. Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In ICCV. 2641--2649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_47_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_1_48_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_49_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Steven J Rennie Etienne Marcheret Youssef Mroueh Jerret Ross and Vaibhava Goel. 2017. Self-critical sequence training for image captioning. In CVPR. 7008--7024.","DOI":"10.1109\/CVPR.2017.131"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Piyush Sharma Nan Ding Sebastian Goodman and Radu Soricut. 2018. Conceptual captions: A cleaned hypernymed image alt-text dataset for automatic image captioning. In ACL. 2556--2565.","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Haoyu Song Li Dong Weinan Zhang Ting Liu and Furu Wei. 2022a. CLIP Models are Few-Shot Learners: Empirical Studies on VQA and Visual Entailment. In ACL. 6088--6100.","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"e_1_3_2_1_53_1","volume-title":"Memorial GAN With Joint Semantic Optimization for Unpaired Image Captioning. TCyber","author":"Song Peipei","year":"2022","unstructured":"Peipei Song, Dan Guo, Jinxing Zhou, Mingliang Xu, and Meng Wang. 2022b. Memorial GAN With Joint Semantic Optimization for Unpaired Image Captioning. TCyber (2022)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Yuqing Song Shizhe Chen Yida Zhao and Qin Jin. 2019. Unpaired cross-lingual image caption generation with self-supervised rewards. In ACM MM. 784--792.","DOI":"10.1145\/3343031.3350996"},{"key":"e_1_3_2_1_55_1","volume-title":"Language models can see: plugging visual controls in text generation. arXiv preprint arXiv:2205.02655","author":"Su Yixuan","year":"2022","unstructured":"Yixuan Su, Tian Lan, Yahui Liu, Fangyu Liu, Dani Yogatama, Yan Wang, Lingpeng Kong, and Nigel Collier. 2022. Language models can see: plugging visual controls in text generation. arXiv preprint arXiv:2205.02655 (2022)."},{"key":"e_1_3_2_1_56_1","volume-title":"Reinforcement learning: An introduction","author":"Sutton Richard S","unstructured":"Richard S Sutton and Andrew G Barto. 2018. Reinforcement learning: An introduction. 
MIT press."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Mingkang Tang Zhanyu Wang Zhenhua Liu Fengyun Rao Dian Li and Xiu Li. 2021. Clip4caption: Clip for video caption. In ACM MM. 4858--4862.","DOI":"10.1145\/3474085.3479207"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Yoad Tewel Yoav Shalev Idan Schwartz and Lior Wolf. 2022. ZeroCap: Zero-Shot Image-to-Text Generation for Visual-Semantic Arithmetic. In CVPR. 17918--17928.","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"e_1_3_2_1_59_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS. 5998--6008."},{"key":"e_1_3_2_1_60_1","volume-title":"Cider: Consensus-based image description evaluation. In CVPR. 4566--4575.","author":"Vedantam Ramakrishna","year":"2015","unstructured":"Ramakrishna Vedantam, C Lawrence Zitnick, and Devi Parikh. 2015. Cider: Consensus-based image description evaluation. In CVPR. 4566--4575."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"e_1_3_2_1_62_1","volume-title":"Generative recommendation: Towards next-generation recommender paradigm. arXiv preprint arXiv:2304.03516","author":"Wang Wenjie","year":"2023","unstructured":"Wenjie Wang, Xinyu Lin, Fuli Feng, Xiangnan He, and Tat-Seng Chua. 2023. Generative recommendation: Towards next-generation recommender paradigm. arXiv preprint arXiv:2304.03516 (2023)."},{"key":"e_1_3_2_1_63_1","volume-title":"Cris: Clip-driven referring image segmentation. In CVPR. 11686--11695.","author":"Wang Zhaoqing","year":"2022","unstructured":"Zhaoqing Wang, Yu Lu, Qiang Li, Xunqiang Tao, Yandong Guo, Mingming Gong, and Tongliang Liu. 2022a. Cris: Clip-driven referring image segmentation. In CVPR. 11686--11695."},{"key":"e_1_3_2_1_64_1","volume-title":"Zihang Dai, Yulia Tsvetkov, and Yuan Cao.","author":"Wang Zirui","year":"2022","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2022b. SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. In ICLR."},{"key":"e_1_3_2_1_65_1","volume-title":"Msr-vtt: A large video description dataset for bridging video and language. In CVPR. 5288--5296.","author":"Xu Jun","year":"2016","unstructured":"Jun Xu, Tao Mei, Ting Yao, and Yong Rui. 2016. Msr-vtt: A large video description dataset for bridging video and language. In CVPR. 5288--5296."},{"key":"e_1_3_2_1_66_1","unstructured":"Kelvin Xu Jimmy Ba Ryan Kiros Kyunghyun Cho Aaron Courville Ruslan Salakhudinov Rich Zemel and Yoshua Bengio. 2015. Show attend and tell: Neural image caption generation with visual attention. In ICML. 2048--2057."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"crossref","unstructured":"Ting Yao Yingwei Pan Yehao Li and Tao Mei. 2018. Exploring visual relationship for image captioning. In ECCV. 684--699.","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"e_1_3_2_1_68_1","volume-title":"Seqgan: Sequence generative adversarial nets with policy gradient. In AAAI.","author":"Yu Lantao","year":"2017","unstructured":"Lantao Yu, Weinan Zhang, Jun Wang, and Yong Yu. 2017. Seqgan: Sequence generative adversarial nets with policy gradient. 
In AAAI."},{"key":"e_1_3_2_1_69_1","volume-title":"Gunhee Kim, et al.","author":"Yu Youngjae","year":"2022","unstructured":"Youngjae Yu, Jiwan Chung, Heeseung Yun, Jack Hessel, JaeSung Park, Ximing Lu, Prithviraj Ammanabrolu, Rowan Zellers, Ronan Le Bras, Gunhee Kim, et al. 2022. Multimodal Knowledge Alignment with Reinforcement Learning. arXiv preprint arXiv:2205.12630 (2022)."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"crossref","unstructured":"Rowan Zellers Mark Yatskar Sam Thomson and Yejin Choi. 2018. Neural motifs: Scene graph parsing with global context. In CVPR. 5831--5840.","DOI":"10.1109\/CVPR.2018.00611"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"Yucheng Zhou Wei Tao and Wenqiang Zhang. 2021. Triple sequence generative adversarial nets for unsupervised image captioning. In ICASSP. 7598--7602.","DOI":"10.1109\/ICASSP39728.2021.9414335"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"crossref","unstructured":"Bin Zhu and Chong-Wah Ngo. 2020. CookGAN: Causality based text-to-image synthesis. In CVPR. 5519--5527.","DOI":"10.1109\/CVPR42600.2020.00556"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"crossref","unstructured":"Bin Zhu Chong-Wah Ngo Jingjing Chen and Yanbin Hao. 2019. R2gan: Cross-modal recipe retrieval with generative adversarial network. In CVPR. 11477--11486.","DOI":"10.1109\/CVPR.2019.01174"},{"key":"e_1_3_2_1_74_1","volume-title":"Unpaired Image Captioning by Image-level Weakly-Supervised Visual Concept Recognition. TMM","author":"Zhu Peipei","year":"2022","unstructured":"Peipei Zhu, Xiao Wang, Yong Luo, Zhenglong Sun, Wei-Shi Zheng, Yaowei Wang, and Changwen Chen. 2022. Unpaired Image Captioning by Image-level Weakly-Supervised Visual Concept Recognition. TMM (2022), 1--15."},{"key":"e_1_3_2_1_75_1","volume-title":"Prompt-based learning for unpaired image captioning. TMM","author":"Zhu Peipei","year":"2023","unstructured":"Peipei Zhu, Xiao Wang, Lin Zhu, Zhenglong Sun, Wei-Shi Zheng, Yaowei Wang, and Changwen Chen. 2023. Prompt-based learning for unpaired image captioning. TMM (2023), 1--15."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611891","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611891","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:03:19Z","timestamp":1755820999000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611891"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":75,"alternative-id":["10.1145\/3581783.3611891","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611891","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}