{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T17:20:20Z","timestamp":1765041620552,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700223","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Exploring Annotation-free Image Captioning with Retrieval-augmented Pseudo Sentence Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7554-0536","authenticated-orcid":false,"given":"Zhiyuan","family":"Li","sequence":"first","affiliation":[{"name":"University of Sydney, Sydney, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8102-3949","authenticated-orcid":false,"given":"Dongnan","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Sydney, Sydney, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5473-5751","authenticated-orcid":false,"given":"Heng","family":"Wang","sequence":"additional","affiliation":[{"name":"The University of Sydney, Sydney, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8492-9711","authenticated-orcid":false,"given":"Chaoyi","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Sydney, Sydney, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3706-8896","authenticated-orcid":false,"given":"Weidong","family":"Cai","sequence":"additional","affiliation":[{"name":"University of Sydney, Sydney, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et\u00a0al. 2022. Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems 35 (2022) 23716\u201323736."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"e_1_3_3_1_4_2","first-page":"65","volume-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. 65\u201372. https:\/\/dl.acm.org\/doi\/10.5555\/1626355.1626389"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Huixia Ben Yingwei Pan Yehao Li Ting Yao Richang Hong Meng Wang and Tao Mei. 2021. Unpaired image captioning with semantic-constrained self-learning. IEEE Transactions on Multimedia 24 (2021) 904\u2013916.","DOI":"10.1109\/TMM.2021.3060948"},{"key":"e_1_3_3_1_6_2","first-page":"1059","volume-title":"International Conference on Machine Learning","author":"Brock Andy","year":"2021","unstructured":"Andy Brock, Soham De, Samuel\u00a0L Smith, and Karen Simonyan. 2021. High-performance large-scale image recognition without normalization. In International Conference on Machine Learning. PMLR, 1059\u20131071."},{"key":"e_1_3_3_1_7_2","unstructured":"Minwoo Byeon Beomhee Park Haecheon Kim Sungjun Lee Woonhyuk Baek and Saehoon Kim. 2022. COYO-700M: Image-Text Pair Dataset. https:\/\/github.com\/kakaobrain\/coyo-dataset."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01750"},{"key":"e_1_3_3_1_9_2","unstructured":"Jun Chen Deyao Zhu Xiaoqian Shen Xiang Li Zechun Liu Pengchuan Zhang Raghuraman Krishnamoorthi Vikas Chandra Yunyang Xiong and Mohamed Elhoseiny. 2023. Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.09478 (2023)."},{"key":"e_1_3_3_1_10_2","unstructured":"Wenhu Chen Hexiang Hu Chitwan Saharia and William\u00a0W Cohen. 2022. Re-imagen: Retrieval-augmented text-to-image generator. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.14491 (2022)."},{"key":"e_1_3_3_1_11_2","unstructured":"Xinlei Chen Hao Fang Tsung-Yi Lin Ramakrishna Vedantam Saurabh Gupta Piotr Doll\u00e1r and C\u00a0Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1504.00325 (2015)."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00059"},{"key":"e_1_3_3_1_13_2","unstructured":"Wei-Lin Chiang Zhuohan Li Zi Lin Ying Sheng Zhanghao Wu Hao Zhang Lianmin Zheng Siyuan Zhuang Yonghao Zhuang Joseph\u00a0E Gonzalez et\u00a0al. 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed 14 April 2023) 2 3 (2023) 6."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Jaemin Cho Seunghyun Yoon Ajinkya Kale Franck Dernoncourt Trung Bui and Mohit Bansal. 2022. Fine-grained image captioning with clip reward. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.13115 (2022).","DOI":"10.18653\/v1\/2022.findings-naacl.39"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_3_1_16_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et\u00a0al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00425"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Dan Guo Yang Wang Peipei Song and Meng Wang. 2020. Recurrent relational memory network for unsupervised image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.13611 (2020).","DOI":"10.24963\/ijcai.2020\/128"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Jack Hessel Ari Holtzman Maxwell Forbes Ronan\u00a0Le Bras and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.08718 (2021).","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"e_1_3_3_1_20_2","unstructured":"Jordan Hoffmann Sebastian Borgeaud Arthur Mensch Elena Buchatskaya Trevor Cai Eliza Rutherford Diego de\u00a0Las Casas Lisa\u00a0Anne Hendricks Johannes Welbl Aidan Clark et\u00a0al. 2022. Training compute-optimal large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2203.15556 (2022)."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Ukyo Honda Yoshitaka Ushiku Atsushi Hashimoto Taro Watanabe and Yuji Matsumoto. 2021. Removing word-level spurious alignment between images and pseudo-captions in unsupervised image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.13872 (2021).","DOI":"10.18653\/v1\/2021.eacl-main.323"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/105"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David\u00a0A Shamma et\u00a0al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International Journal of Computer Vision 123 1 (2017) 32\u201373.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01744"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00751"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Mike Lewis Yinhan Liu Naman Goyal Marjan Ghazvininejad Abdelrahman Mohamed Omer Levy Ves Stoyanov and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation translation and comprehension. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1910.13461 (2019).","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00902"},{"key":"e_1_3_3_1_29_2","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.12597 (2023)."},{"key":"e_1_3_3_1_30_2","first-page":"12888","volume-title":"International Conference on Machine Learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_1_31_2","unstructured":"Wei Li Linchao Zhu Longyin Wen and Yi Yang. 2023. Decap: Decoding clip latents for zero-shot captioning via text-only training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.03032 (2023)."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/DICTA60407.2023.00058"},{"key":"e_1_3_3_1_34_2","unstructured":"Zhiyuan Li Dongnan Liu Chaoyi Zhang Heng Wang Tengfei Xue and Weidong Cai. 2024. Enhancing Advanced Visual Reasoning Ability of Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.13980 (2024)."},{"key":"e_1_3_3_1_35_2","unstructured":"Zhiyuan Li Heng Wang Dongnan Liu Chaoyi Zhang Ao Ma Jieting Long and Weidong Cai. 2024. Multimodal Causal Reasoning Benchmark: Challenging Vision Large Language Models to Infer Causal Links Between Siamese Images. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.08105 (2024)."},{"key":"e_1_3_3_1_36_2","first-page":"74","volume-title":"Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text Summarization Branches Out. 74\u201381."},{"key":"e_1_3_3_1_37_2","unstructured":"Weizhe Lin and Bill Byrne. 2022. Retrieval augmented visual question answering with outside knowledge. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.03809 (2022)."},{"key":"e_1_3_3_1_38_2","unstructured":"Fenglin Liu Meng Gao Tianhao Zhang and Yuexian Zou. 2021. Exploring semantic relationships for unpaired image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.10658 (2021)."},{"key":"e_1_3_3_1_39_2","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong\u00a0Jae Lee. 2023. Improved baselines with visual instruction tuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.03744 (2023)."},{"key":"e_1_3_3_1_40_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"David Nukrai Ron Mokady and Amir Globerson. 2022. Text-only training for image captioning using noise-injected clip. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.00575 (2022).","DOI":"10.18653\/v1\/2022.findings-emnlp.299"},{"key":"e_1_3_3_1_43_2","unstructured":"Aaron van\u00a0den Oord Yazhe Li and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1807.03748 (2018)."},{"key":"e_1_3_3_1_44_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318. https:\/\/dl.acm.org\/doi\/10.3115\/1073083.1073135"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_3_1_46_2","first-page":"8748","volume-title":"International Conference on Machine Learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_47_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Nils Reimers and Iryna Gurevych. 2019. Sentence-BERT: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1908.10084 (2019).","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_3_1_49_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_3_1_51_2","unstructured":"Zirui Wang Jiahui Yu Adams\u00a0Wei Yu Zihang Dai Yulia Tsvetkov and Yuan Cao. 2021. Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.10904 (2021)."},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01749"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"crossref","unstructured":"Zhuolin Yang Wei Ping Zihan Liu Vijay Korthikanti Weili Nie De-An Huang Linxi Fan Zhiding Yu Shiyi Lan Bo Li et\u00a0al. 2023. Re-vilm: Retrieval-augmented visual language model for zero and few-shot image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.04858 (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.793"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611891"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557382"},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414335"},{"key":"e_1_3_3_1_57_2","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.10592 (2023)."},{"key":"e_1_3_3_1_58_2","unstructured":"Peipei Zhu Xiao Wang Yong Luo Zhenglong Sun Wei-Shi Zheng Yaowei Wang and Changwen Chen. 2022. Unpaired Image Captioning by Image-level Weakly-Supervised Visual Concept Recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2203.03195 (2022)."},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"crossref","unstructured":"Peipei Zhu Xiao Wang Lin Zhu Zhenglong Sun Wei-Shi Zheng Yaowei Wang and Changwen Chen. 2023. Prompt-based learning for unpaired image captioning. IEEE Transactions on Multimedia (2023).","DOI":"10.1109\/TMM.2023.3265842"}],"event":{"name":"MMAsia '24: ACM Multimedia Asia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Auckland New Zealand","acronym":"MMAsia '24"},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700223","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700223","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:15Z","timestamp":1750295415000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700223"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":58,"alternative-id":["10.1145\/3696409.3700223","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700223","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}