{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T19:33:27Z","timestamp":1762544007150,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Key Program of the National Natural Science Foundation of China","award":["61932009"],"award-info":[{"award-number":["61932009"]}]},{"name":"the National Natural Science Foundation of China","award":["62202439"],"award-info":[{"award-number":["62202439"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658080","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"320-329","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Pseudo Content Hallucination for Unpaired Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7946-8199","authenticated-orcid":false,"given":"Huixia","family":"Ben","sequence":"first","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4881-9344","authenticated-orcid":false,"given":"Shuo","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3094-7735","authenticated-orcid":false,"given":"Meng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5461-3986","authenticated-orcid":false,"given":"Richang","family":"Hong","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Spice: Semantic propositional image caption evaluation","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In ECCV. Springer, 382--398."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-up and top-down attention for image captioning and visual question answering. In CVPR. 6077--6086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.3390\/app10175978"},{"key":"e_1_3_2_1_4_1","volume-title":"METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In ACL Workshop. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In ACL Workshop. 65--72."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3060948"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.08.019"},{"key":"e_1_3_2_1_7_1","volume-title":"Hierarchical Supervised Contrastive Learning for Multimodal Sentiment Analysis. In International Conference on Multimedia Modeling. Springer Nature Switzerland, 56--69","author":"Chen Kezhou","year":"2024","unstructured":"Kezhou Chen, Shuo Wang, and Yanbin Hao. 2024. Hierarchical Supervised Contrastive Learning for Multimodal Sentiment Analysis. In International Conference on Multimedia Modeling. Springer Nature Switzerland, 56--69."},{"key":"e_1_3_2_1_8_1","volume-title":"A semi-supervised framework for image captioning. arXiv preprint arXiv:1611.05321","author":"Chen Wenhu","year":"2016","unstructured":"Wenhu Chen, Aurelien Lucchi, and Thomas Hofmann. 2016. A semi-supervised framework for image captioning. arXiv preprint arXiv:1611.05321 (2016)."},{"key":"e_1_3_2_1_9_1","volume-title":"Devi Parikh, and Dhruv Batra.","author":"Das Abhishek","year":"2017","unstructured":"Abhishek Das, Satwik Kottur, Khushi Gupta, Avi Singh, Deshraj Yadav, Jos\u00e9 MF Moura, Devi Parikh, and Dhruv Batra. 2017. Visual dialog. In CVPR. 326--335."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-022-12042-8"},{"key":"e_1_3_2_1_11_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_12_1","volume-title":"Improved regularization of convolutional neural networks with cutout. arXiv preprint arXiv:1708.04552","author":"DeVries Terrance","year":"2017","unstructured":"Terrance DeVries and Graham W Taylor. 2017. Improved regularization of convolutional neural networks with cutout. arXiv preprint arXiv:1708.04552 (2017)."},{"key":"e_1_3_2_1_13_1","volume-title":"Patchup: A regularization technique for convolutional neural networks.","author":"Faramarzi Mojtaba","year":"2022","unstructured":"Mojtaba Faramarzi, Mohammad Amini, Akilesh Badrinaaraayanan, Vikas Verma, and Sarath Chandar. 2022. Patchup: A regularization technique for convolutional neural networks. (2022), 589--597."},{"key":"e_1_3_2_1_14_1","volume-title":"Automatic caption generation for news images","author":"Feng Yansong","year":"2012","unstructured":"Yansong Feng and Mirella Lapata. 2012. Automatic caption generation for news images. IEEE transactions on pattern analysis and machine intelligence, Vol. 35, 4 (2012), 797--812."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Yang Feng Lin Ma Wei Liu and Jiebo Luo. 2019. Unsupervised image captioning. In CVPR. 4125--4134.","DOI":"10.1109\/CVPR.2019.00425"},{"key":"e_1_3_2_1_16_1","volume-title":"UNISON: Unpaired Cross-Lingual Image Captioning. In AAAI. 10654--10662.","author":"Gao Jiahui","year":"2022","unstructured":"Jiahui Gao, Yi Zhou, LH Philip, Shafiq Joty, and Jiuxiang Gu. 2022. UNISON: Unpaired Cross-Lingual Image Captioning. In AAAI. 10654--10662."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Jiuxiang Gu Shafiq Joty Jianfei Cai and Gang Wang. 2018. Unpaired image captioning by language pivoting. In ECCV. 503--519.","DOI":"10.1007\/978-3-030-01246-5_31"},{"key":"e_1_3_2_1_18_1","unstructured":"Jiuxiang Gu Shafiq Joty Jianfei Cai Handong Zhao Xu Yang and Gang Wang. 2019. Unpaired image captioning via scene graph alignments. In ICCV. 10323--10332."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Dan Guo Shuo Wang Qi Tian and Meng Wang. 2019. Dense Temporal Convolution Network for Sign Language Translation. In IJCAI. 744--750.","DOI":"10.24963\/ijcai.2019\/105"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Dan Guo Yang Wang Peipei Song and Meng Wang. 2021. Recurrent relational memory network for unsupervised image captioning. In IJCAI. 920--926.","DOI":"10.24963\/ijcai.2020\/128"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Danna Gurari Qing Li Abigale J Stangl Anhong Guo Chi Lin Kristen Grauman Jiebo Luo and Jeffrey P Bigham. 2018. Vizwiz grand challenge: Answering visual questions from blind people. In CVPR. 3608--3617.","DOI":"10.1109\/CVPR.2018.00380"},{"volume-title":"Captioning images taken by people who are blind","author":"Gurari Danna","key":"e_1_3_2_1_22_1","unstructured":"Danna Gurari, Yinan Zhao, Meng Zhang, and Nilavra Bhattacharya. 2020. Captioning images taken by people who are blind. In ECCV. Springer, 417--434."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3169842"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3221292"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Ukyo Honda Yoshitaka Ushiku Atsushi Hashimoto Taro Watanabe and Yuji Matsumoto. 2021. Removing Word-Level Spurious Alignment between Images and Pseudo-Captions in Unsupervised Image Captioning. In EACL. 3692--3702.","DOI":"10.18653\/v1\/2021.eacl-main.323"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Yicong Hong Qi Wu Yuankai Qi Cristian Rodriguez-Opazo and Stephen Gould. 2021. Vln bert: A recurrent vision-and-language bert for navigation. In CVPR. 1643--1653.","DOI":"10.1109\/CVPR46437.2021.00169"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Jonathan Huang Vivek Rathod Chen Sun Menglong Zhu Anoop Korattikara Alireza Fathi Ian Fischer Zbigniew Wojna Yang Song Sergio Guadarrama et al. 2017. Speed\/accuracy trade-offs for modern convolutional object detectors. In CVPR. 7310--7311.","DOI":"10.1109\/CVPR.2017.351"},{"key":"e_1_3_2_1_28_1","volume-title":"Image captioning using deep stacked LSTMs, contextual word embeddings and data augmentation. arXiv preprint arXiv:2102.11237","author":"Katiyar Sulabh","year":"2021","unstructured":"Sulabh Katiyar and Samir Kumar Borgohain. 2021. Image captioning using deep stacked LSTMs, contextual word embeddings and data augmentation. arXiv preprint arXiv:2102.11237 (2021)."},{"key":"e_1_3_2_1_29_1","volume-title":"Image captioning with very scarce supervised data: Adversarial semi-supervised learning approach. arXiv preprint arXiv:1909.02201","author":"Kim Dong-Jin","year":"2019","unstructured":"Dong-Jin Kim, Jinsoo Choi, Tae-Hyun Oh, and In So Kweon. 2019. Image captioning with very scarce supervised data: Adversarial semi-supervised learning approach. arXiv preprint arXiv:1909.02201 (2019)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David A Shamma et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision Vol. 123 1 (2017) 32--73.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Iro Laina Christian Rupprecht and Nassir Navab. 2019. Towards Unsupervised Image Captioning with Shared Multimodal Embeddings. In ICCV. 7414--7424.","DOI":"10.1109\/ICCV.2019.00751"},{"key":"e_1_3_2_1_32_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In ACL. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In ACL. 74--81."},{"volume-title":"Microsoft coco: Common objects in context","author":"Lin Tsung-Yi","key":"e_1_3_2_1_33_1","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In ECCV. Springer, 740--755."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i10.17047"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Fenglin Liu Meng Gao Tianhao Zhang and Yuexian Zou. 2019. Exploring semantic relationships for image captioning without parallel data. In ICDM. 439--448.","DOI":"10.1109\/ICDM.2019.00054"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611784"},{"key":"e_1_3_2_1_37_1","unstructured":"Jiasen Lu Caiming Xiong Devi Parikh and Richard Socher. 2017. Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In CVPR. 375--383."},{"key":"e_1_3_2_1_38_1","first-page":"3613","article-title":"Diverse image captioning with context-object split latent spaces","volume":"33","author":"Mahajan Shweta","year":"2020","unstructured":"Shweta Mahajan and Stefan Roth. 2020. Diverse image captioning with context-object split latent spaces. NeuralIPS , Vol. 33 (2020), 3613--3624.","journal-title":"NeuralIPS"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Zihang Meng David Yang Xuefei Cao Ashish Shah and Ser-Nam Lim. 2022. Object-Centric Unsupervised Image Captioning. In ECCV. 219--235.","DOI":"10.1007\/978-3-031-20059-5_13"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In ACL. 311--318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Bryan A Plummer Liwei Wang Chris M Cervantes Juan C Caicedo Julia Hockenmaier and Svetlana Lazebnik. 2015. Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In ICCV. 2641--2649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_42_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In NeurIPS. 91--99."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Piyush Sharma Nan Ding Sebastian Goodman and Radu Soricut. 2018. Conceptual captions: A cleaned hypernymed image alt-text dataset for automatic image captioning. In ACL. 2556--2565.","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_44_1","volume-title":"Memorial GAN With Joint Semantic Optimization for Unpaired Image Captioning. TCyber","author":"Song Peipei","year":"2022","unstructured":"Peipei Song, Dan Guo, Jinxing Zhou, Mingliang Xu, and Meng Wang. 2022. Memorial GAN With Joint Semantic Optimization for Unpaired Image Captioning. TCyber (2022)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Yuqing Song Shizhe Chen Yida Zhao and Qin Jin. 2019. Unpaired cross-lingual image caption generation with self-supervised rewards. In ACM MM. 784--792.","DOI":"10.1145\/3343031.3350996"},{"key":"e_1_3_2_1_46_1","volume-title":"Dropout from higher education: A theoretical synthesis of recent research. Review of educational research","author":"Tinto Vincent","year":"1975","unstructured":"Vincent Tinto. 1975. Dropout from higher education: A theoretical synthesis of recent research. Review of educational research , Vol. 45, 1 (1975), 89--125."},{"key":"e_1_3_2_1_47_1","volume-title":"Cider: Consensus-based image description evaluation. In CVPR. 4566--4575.","author":"Vedantam Ramakrishna","year":"2015","unstructured":"Ramakrishna Vedantam, C Lawrence Zitnick, and Devi Parikh. 2015. Cider: Consensus-based image description evaluation. In CVPR. 4566--4575."},{"key":"e_1_3_2_1_48_1","unstructured":"Vikas Verma Alex Lamb Christopher Beckham Amir Najafi Ioannis Mitliagkas David Lopez-Paz and Yoshua Bengio. 2019. Manifold mixup: Better representations by interpolating hidden states. In ICML. PMLR 6438--6447."},{"key":"e_1_3_2_1_49_1","volume-title":"Show and tell: Lessons learned from the 2015 mscoco image captioning challenge","author":"Vinyals Oriol","year":"2016","unstructured":"Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. 2016. Show and tell: Lessons learned from the 2015 mscoco image captioning challenge. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 4 (2016), 652--663."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3115432","article-title":"Image captioning with deep bidirectional LSTMs and multi-task learning","volume":"14","author":"Wang Cheng","year":"2018","unstructured":"Cheng Wang, Haojin Yang, and Christoph Meinel. 2018b. Image captioning with deep bidirectional LSTMs and multi-task learning. TOMM, Vol. 14, 2s (2018), 1--20.","journal-title":"TOMM"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3522713"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3314577"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240671"},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings, Part X 16","author":"Wang Shuo","year":"2020","unstructured":"Shuo Wang, Jun Yue, Jianzhuang Liu, Qi Tian, and Meng Wang. 2020. Large-scale Few-shot Learning via Multi-modal Knowledge Discovery. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part X 16. Springer International Publishing, 718--734."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547837"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2998181.2998364"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Jiarui Yu Haoran Li Yanbin Hao Bin Zhu Tong Xu and Xiangnan He. 2023. CgT-GAN: CLIP-guided Text GAN for Image Captioning. In ACM MM. 2252--2263.","DOI":"10.1145\/3581783.3611891"},{"key":"e_1_3_2_1_58_1","volume-title":"Sanghyuk Chun, Junsuk Choe, and Youngjoon Yoo.","author":"Yun Sangdoo","year":"2019","unstructured":"Sangdoo Yun, Dongyoon Han, Seong Joon Oh, Sanghyuk Chun, Junsuk Choe, and Youngjoon Yoo. 2019. Cutmix: Regularization strategy to train strong classifiers with localizable features. In ICCV. 6023--6032."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Rowan Zellers Mark Yatskar Sam Thomson and Yejin Choi. 2018. Neural motifs: Scene graph parsing with global context. In CVPR. 5831--5840.","DOI":"10.1109\/CVPR.2018.00611"},{"key":"e_1_3_2_1_60_1","unstructured":"Hongyi Zhang Moustapha Cisse Yann N Dauphin and David Lopez-Paz. 2018. mixup: Beyond Empirical Risk Minimization. In ICLR."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2976552"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Yucheng Zhou Wei Tao and Wenqiang Zhang. 2021. Triple sequence generative adversarial nets for unsupervised image captioning. In ICASSP. 7598--7602.","DOI":"10.1109\/ICASSP39728.2021.9414335"},{"key":"e_1_3_2_1_63_1","volume-title":"Unpaired Image Captioning by Image-level Weakly-Supervised Visual Concept Recognition. TMM","author":"Zhu Peipei","year":"2022","unstructured":"Peipei Zhu, Xiao Wang, Yong Luo, Zhenglong Sun, Wei-Shi Zheng, Yaowei Wang, and Changwen Chen. 2022. Unpaired Image Captioning by Image-level Weakly-Supervised Visual Concept Recognition. TMM (2022), 1--15."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28614"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Phuket Thailand","acronym":"ICMR '24"},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658080","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658080","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:53:42Z","timestamp":1755766422000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658080"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":64,"alternative-id":["10.1145\/3652583.3658080","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658080","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}