{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:11:22Z","timestamp":1765357882726,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754714","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"2683-2692","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["SynC: Synthetic Image Caption Dataset Refinement with One-to-many Mapping for Zero-shot Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-3373-3390","authenticated-orcid":false,"given":"Si-Woo","family":"Kim","sequence":"first","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9577-7400","authenticated-orcid":false,"given":"MinJu","family":"Jeon","sequence":"additional","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0809-0047","authenticated-orcid":false,"given":"Ye-Chan","family":"Kim","sequence":"additional","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0716-5962","authenticated-orcid":false,"given":"Soeun","family":"Lee","sequence":"additional","affiliation":[{"name":"AI R&amp;D Division, CJ Group, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2126-0704","authenticated-orcid":false,"given":"Taewhan","family":"Kim","sequence":"additional","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7231-7494","authenticated-orcid":false,"given":"Dong-Jin","family":"Kim","sequence":"additional","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"e_1_3_2_2_2_1","volume-title":"Spice: Semantic propositional image caption evaluation. In Computer Vision-ECCV 2016: 14th European Conference","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer, 382-398."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01834"},{"key":"e_1_3_2_2_4_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-43148-7_10"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00753"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"e_1_3_2_2_8_1","volume-title":"Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_2_9_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01124"},{"key":"e_1_3_2_2_11_1","volume-title":"Testing relational understanding in text-guided image generation. arXiv preprint arXiv:2208.00005","author":"Conwell Colin","year":"2022","unstructured":"Colin Conwell and Tomer Ullman. 2022. Testing relational understanding in text-guided image generation. arXiv preprint arXiv:2208.00005 (2022)."},{"key":"e_1_3_2_2_12_1","first-page":"35544","article-title":"Improving clip training with language rewrites","volume":"36","author":"Fan Lijie","year":"2023","unstructured":"Lijie Fan, Dilip Krishnan, Phillip Isola, Dina Katabi, and Yonglong Tian. 2023. Improving clip training with language rewrites. Advances in Neural Information Processing Systems, Vol. 36 (2023), 35544-35575.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00291"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00425"},{"key":"e_1_3_2_2_15_1","volume-title":"SynthCLIP: Are We Ready for a Fully Synthetic CLIP Training? arXiv preprint arXiv:2402.01832","author":"Al Kader Hammoud Hasan Abed","year":"2024","unstructured":"Hasan Abed Al Kader Hammoud, Hani Itani, Fabio Pizzati, Philip Torr, Adel Bibi, and Bernard Ghanem. 2024. SynthCLIP: Are We Ready for a Fully Synthetic CLIP Training? arXiv preprint arXiv:2402.01832 (2024)."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01458"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_2_19_1","volume-title":"Image captioning with very scarce supervised data: Adversarial semi-supervised learning approach. arXiv preprint arXiv:1909.02201","author":"Kim Dong-Jin","year":"2019","unstructured":"Dong-Jin Kim, Jinsoo Choi, Tae-Hyun Oh, and In So Kweon. 2019. Image captioning with very scarce supervised data: Adversarial semi-supervised learning approach. arXiv preprint arXiv:1909.02201 (2019)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3423790"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3754715"},{"key":"e_1_3_2_2_22_1","volume-title":"European Conference on Computer Vision. Springer, 111-127","author":"Lai Zhengfeng","year":"2024","unstructured":"Zhengfeng Lai, Haotian Zhang, Bowen Zhang, Wentao Wu, Haoping Bai, Aleksei Timofeev, Xianzhi Du, Zhe Gan, Jiulong Shan, Chen-Nee Chuah, et al., 2024. Veclip: Improving clip training via visual-enriched captions. In European Conference on Computer Vision. Springer, 111-127."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1153"},{"key":"e_1_3_2_2_24_1","volume-title":"Agrim Gupta, Yunzhi Zhang, Deepak Narayanan, Hannah Teufel, Marco Bellagente, et al.","author":"Lee Tony","year":"2024","unstructured":"Tony Lee, Michihiro Yasunaga, Chenlin Meng, Yifan Mai, Joon Sung Park, Agrim Gupta, Yunzhi Zhang, Deepak Narayanan, Hannah Teufel, Marco Bellagente, et al., 2024b. Holistic evaluation of text-to-image models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_25_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_2_26_1","volume-title":"Leveraging unpaired data for vision-language generative models via cycle consistency. arXiv preprint arXiv:2310.03734","author":"Li Tianhong","year":"2023","unstructured":"Tianhong Li, Sangnie Bhardwaj, Yonglong Tian, Han Zhang, Jarred Barber, Dina Katabi, Guillaume Lajoie, Huiwen Chang, and Dilip Krishnan. 2023a. Leveraging unpaired data for vision-language generative models via cycle consistency. arXiv preprint arXiv:2310.03734 (2023)."},{"key":"e_1_3_2_2_27_1","volume-title":"Decap: Decoding clip latents for zero-shot captioning via text-only training. arXiv preprint arXiv:2303.03032","author":"Li Wei","year":"2023","unstructured":"Wei Li, Linchao Zhu, Longyin Wen, and Yi Yang. 2023b. Decap: Decoding clip latents for zero-shot captioning via text-only training. arXiv preprint arXiv:2303.03032 (2023)."},{"key":"e_1_3_2_2_28_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28178"},{"key":"e_1_3_2_2_31_1","first-page":"5775","article-title":"Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps","volume":"35","author":"Lu Cheng","year":"2022","unstructured":"Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu. 2022. Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps. Advances in Neural Information Processing Systems, Vol. 35 (2022), 5775-5787.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_32_1","volume-title":"Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image Captioning. In European Conference on Computer Vision (ECCV).","author":"Luo Jianjie","year":"2024","unstructured":"Jianjie Luo, Jingwen Chen, Yehao Li, Yingwei Pan, Jianlin Feng, Hongyang Chao, and Ting Yao. 2024. Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image Captioning. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28203"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02116"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01428"},{"key":"e_1_3_2_2_36_1","first-page":"22047","article-title":"Improving multimodal datasets with image captioning","volume":"36","author":"Nguyen Thao","year":"2023","unstructured":"Thao Nguyen, Samir Yitzhak Gadre, Gabriel Ilharco, Sewoong Oh, and Ludwig Schmidt. 2023. Improving multimodal datasets with image captioning. Advances in Neural Information Processing Systems, Vol. 36 (2023), 22047-22069.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.299"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3755130"},{"key":"e_1_3_2_2_39_1","volume-title":"Dong-Jin Kim, In So Kweon, and Junmo Kim.","author":"Oh Youngtaek","year":"2024","unstructured":"Youngtaek Oh, Jae Won Cho, Dong-Jin Kim, In So Kweon, and Junmo Kim. 2024. Preserving Multi-Modal Capabilities of Pre-trained VLMs for Improving Vision-Linguistic Compositionality. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing. 19060-19076."},{"key":"e_1_3_2_2_40_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_2_41_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_2_42_1","volume-title":"International conference on machine learning. Pmlr, 8821-8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International conference on machine learning. Pmlr, 8821-8831."},{"key":"e_1_3_2_2_43_1","volume-title":"Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:1908.10084","author":"Reimers N","year":"2019","unstructured":"N Reimers. 2019. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:1908.10084 (2019)."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00774"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_2_47_1","volume-title":"Ibrahim Alabdulmohsin, Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, et al.","author":"Tschannen Michael","year":"2025","unstructured":"Michael Tschannen, Alexey Gritsenko, Xiao Wang, Muhammad Ferjad Naeem, Ibrahim Alabdulmohsin, Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, et al., 2025. Siglip 2: Multilingual vision-language encoders with improved semantic understanding, localization, and dense features. arXiv preprint arXiv:2502.14786 (2025)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_2_49_1","volume-title":"Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers. Advances in neural information processing systems","author":"Wang Wenhui","year":"2020","unstructured":"Wenhui Wang, Furu Wei, Li Dong, Hangbo Bao, Nan Yang, and Ming Zhou. 2020. Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers. Advances in neural information processing systems, Vol. 33 (2020), 5776-5788."},{"key":"e_1_3_2_2_50_1","volume-title":"Variance Alignment Score: A Simple But Tough-to-Beat Data Selection Method for Multimodal Contrastive Learning. arXiv preprint arXiv:2402.02055","author":"Wang Yiping","year":"2024","unstructured":"Yiping Wang, Yifang Chen, Wendan Yan, Kevin Jamieson, and Simon Shaolei Du. 2024. Variance Alignment Score: A Simple But Tough-to-Beat Data Selection Method for Multimodal Contrastive Learning. arXiv preprint arXiv:2402.02055 (2024)."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611891"},{"key":"e_1_3_2_2_54_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=KRLUvxh8uaX","author":"Yuksekgonul Mert","year":"2023","unstructured":"Mert Yuksekgonul, Federico Bianchi, Pratyusha Kalluri, Dan Jurafsky, and James Zou. 2023. When and why Vision-Language Models behave like Bags-of-Words, and what to do about it?. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=KRLUvxh8uaX"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_2_56_1","volume-title":"DoraCycle: Domain-Oriented Adaptation of Unified Generative Model in Multimodal Cycles. arXiv preprint arXiv:2503.03651","author":"Zhao Rui","year":"2025","unstructured":"Rui Zhao, Weijia Mao, and Mike Zheng Shou. 2025. DoraCycle: Domain-Oriented Adaptation of Unified Generative Model in Multimodal Cycles. arXiv preprint arXiv:2503.03651 (2025)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754714","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:52Z","timestamp":1765343092000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754714"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":56,"alternative-id":["10.1145\/3746027.3754714","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754714","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}