{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:33Z","timestamp":1781539053681,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810851","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"826-834","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SEEDCAP: SEMANTIC EXPANSION AND ENTITY-DRIVEN ZERO-SHOT IMAGE CAPTIONING"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-4886-1267","authenticated-orcid":false,"given":"Binbin","family":"Li","sequence":"first","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China and Institute of Information Engineering,Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5953-2473","authenticated-orcid":false,"given":"Shupei","family":"Xiao","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China and Institute of Information Engineering,Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8604-7226","authenticated-orcid":false,"given":"Dayan","family":"Wu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering,Chinese Academy of Sciences, Beijing, 
China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8407-8648","authenticated-orcid":false,"given":"Gengqi","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China and Institute of Information Engineering,Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1631-4602","authenticated-orcid":false,"given":"Siyu","family":"Jia","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering,Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6883-6293","authenticated-orcid":false,"given":"Zisen","family":"Qi","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering,Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","unstructured":"Nayyer Aafaq Ajmal Mian Wei Liu Syed\u00a0Zulqarnain Gilani and Mubarak Shah. 2019. Video Description: A Survey of Methods Datasets and Evaluation Metrics. Comput. Surveys 52 6 (October 2019) 115:1\u2013115:37. 10.1145\/3355390","DOI":"10.1145\/3355390"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et\u00a0al. 2022. Flamingo: a visual language model for few-shot learning. 
Advances in neural information processing systems 35 (2022) 23716\u201323736.","DOI":"10.52202\/068431-1723"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"e_1_3_3_1_6_2","first-page":"65","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65\u201372."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00512"},{"key":"e_1_3_3_1_8_2","unstructured":"Xi Chen Josip Djolonga Piotr Padlewski Basil Mustafa Soravit Changpinyo Jialin Wu Carlos\u00a0Riquelme Ruiz Sebastian Goodman Xiao Wang Yi Tay et\u00a0al. 2023. Pali-x: On scaling up a multilingual vision and language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.18565 (2023)."},{"key":"e_1_3_3_1_9_2","unstructured":"Xinlei Chen Hao Fang Tsung-Yi Lin Ramakrishna Vedantam Saurabh Gupta Piotr Doll\u00e1r and C.\u00a0Lawrence Zitnick. 2015. Microsoft COCO captions: Data collection and evaluation server. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1504.00325 (2015)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00291"},{"key":"e_1_3_3_1_12_2","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_3_1_14_2","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 
2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00383"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Soeun Lee Si-Woo Kim Taewhan Kim and Dong-Jin Kim. 2024. Ifcap: Image-like retrieval and frequency-based entity filtering for zero-shot captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.18046 (2024).","DOI":"10.18653\/v1\/2024.emnlp-main.1153"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01303"},{"key":"e_1_3_3_1_18_2","unstructured":"Wei Li Linchao Zhu Longyin Wen and Yi Yang. 2023. Decap: Decoding clip latents for zero-shot captioning via text-only training. arXiv preprint (2023)."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28178"},{"key":"e_1_3_3_1_20_2","first-page":"237","volume-title":"European Conference on Computer Vision","author":"Luo Jianjie","year":"2024","unstructured":"Jianjie Luo, Jingwen Chen, Yehao Li, Yingwei Pan, Jianlin Feng, Hongyang Chao, and Ting Yao. 2024. Unleashing Text-to-Image Diffusion Prior for Zero-Shot Image Captioning. In European Conference on Computer Vision. Springer, 237\u2013254."},{"key":"e_1_3_3_1_21_2","unstructured":"Ron Mokady Amir Hertz and Amit\u00a0H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint (2021)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"David Nukrai Ron Mokady and Amir Globerson. 2022. Text-only training for image captioning using noise-injected clip. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.00575 (2022).","DOI":"10.18653\/v1\/2022.findings-emnlp.299"},{"key":"e_1_3_3_1_23_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28260"},{"key":"e_1_3_3_1_25_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_26_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00278"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_1_29_2","unstructured":"Yang Song Prafulla Dhariwal Mark Chen and Ilya Sutskever. 2023. Consistency models. 
(2023)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Junyang Wang Ming Yan Yi Zhang and Jitao Sang. 2023. From association to generation: Text-only captioning by unsupervised cross-modal mapping. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.13273 (2023).","DOI":"10.24963\/ijcai.2023\/481"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Jie Yan Yuxiang Xie Shiwei Zou Yingmei Wei and Xidao Luan. 2025. EntroCap: Zero-shot image captioning with entropy-based retrieval. Neurocomputing 611 (2025) 128666.","DOI":"10.1016\/j.neucom.2024.128666"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Peter Young Alice Lai Micah Hodosh and Julia Hockenmaier. 2014. From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. 
Transactions of the Association for Computational Linguistics 2 (2014) 67\u201378.","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i21.34386"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01337"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:42:14Z","timestamp":1781538134000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810851"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":36,"alternative-id":["10.1145\/3805622.3810851","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810851","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}