{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:14:50Z","timestamp":1765008890800,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3770987","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:08:11Z","timestamp":1765008491000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DualCap: Enhancing Lightweight Image Captioning via Dual Retrieval with Similar Scenes Visual Prompts"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-4886-1267","authenticated-orcid":false,"given":"Binbin","family":"Li","sequence":"first","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China and School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4672-6867","authenticated-orcid":false,"given":"Guimiao","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China and School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6883-6293","authenticated-orcid":false,"given":"Zisen","family":"Qi","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4049-0466","authenticated-orcid":false,"given":"Haiping","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0272-2042","authenticated-orcid":false,"given":"Yu","family":"Ding","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Peter Anderson Basura Fernando Mark Johnson and Stephen Gould. 2016. SPICE: Semantic Propositional Image Caption Evaluation. arxiv:https:\/\/arXiv.org\/abs\/1607.08822\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1607.08822"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Manuele Barraco Matteo Stefanini Marcella Cornia Silvia Cascianelli Lorenzo Baraldi and Rita Cucchiara. 2022. CaMEL: Mean Teacher Learning for Image Captioning. arxiv:https:\/\/arXiv.org\/abs\/2202.10492\u00a0[cs.CV]","DOI":"10.1109\/ICPR56361.2022.9955644"},{"key":"e_1_3_3_1_4_2","unstructured":"Xi Chen Xiao Wang Soravit Changpinyo AJ Piergiovanni Piotr Padlewski Daniel Salz Sebastian Goodman Adam Grycner Basil Mustafa Lucas Beyer Alexander Kolesnikov Joan Puigcerver Nan Ding Keran Rong Hassan Akbari Gaurav Mishra Linting Xue Ashish Thapliyal James Bradbury Weicheng Kuo Mojtaba Seyedhosseini Chao Jia Burcu\u00a0Karagol Ayan Carlos Riquelme Andreas Steiner Anelia Angelova Xiaohua Zhai Neil Houlsby and Radu Soricut. 2023. PaLI: A Jointly-Scaled Multilingual Language-Image Model. arxiv:https:\/\/arXiv.org\/abs\/2209.06794\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2209.06794"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-3348"},{"key":"e_1_3_3_1_6_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arxiv:https:\/\/arXiv.org\/abs\/2010.11929\u00a0[cs.CV]"},{"key":"e_1_3_3_1_7_2","unstructured":"Ziniu Hu Ahmet Iscen Chen Sun Zirui Wang Kai-Wei Chang Yizhou Sun Cordelia Schmid David\u00a0A. Ross and Alireza Fathi. 2023. REVEAL: Retrieval-Augmented Visual-Language Pre-Training with Multi-Source Multimodal Knowledge Memory. arxiv:https:\/\/arXiv.org\/abs\/2212.05221\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2212.05221"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"MinJu Jeon Siwoo Kim Soeun Lee and Dong-Jin Kim. 2024. Enhancing Lightweight Image Captioning with Localized Features and Keyword Extraction. SSRN (2024). https:\/\/ssrn.com\/abstract=4866145","DOI":"10.2139\/ssrn.4866145"},{"key":"e_1_3_3_1_9_2","unstructured":"Menglin Jia Luming Tang Bor-Chun Chen Claire Cardie Serge Belongie Bharath Hariharan and Ser-Nam Lim. 2022. Visual Prompt Tuning. arxiv:https:\/\/arXiv.org\/abs\/2203.12119\u00a0[cs.CV]"},{"key":"e_1_3_3_1_10_2","unstructured":"Jeff Johnson Matthijs Douze and Herv\u00e9 J\u00e9gou. 2017. Billion-scale similarity search with GPUs. arxiv:https:\/\/arXiv.org\/abs\/1702.08734\u00a0[cs.CV]"},{"key":"e_1_3_3_1_11_2","unstructured":"Andrej Karpathy and Li Fei-Fei. 2015. Deep Visual-Semantic Alignments for Generating Image Descriptions. arxiv:https:\/\/arXiv.org\/abs\/1412.2306\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1412.2306"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32454"},{"key":"e_1_3_3_1_13_2","unstructured":"Diederik\u00a0P Kingma and Max Welling. 2022. Auto-Encoding Variational Bayes. arxiv:https:\/\/arXiv.org\/abs\/1312.6114\u00a0[stat.ML]"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Brian Lester Rami Al-Rfou and Noah Constant. 2021. The Power of Scale for Parameter-Efficient Prompt Tuning. arxiv:https:\/\/arXiv.org\/abs\/2104.08691\u00a0[cs.CL]","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"e_1_3_3_1_15_2","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2301.12597\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2301.12597"},{"key":"e_1_3_3_1_16_2","unstructured":"Junnan Li Dongxu Li Caiming Xiong and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arxiv:https:\/\/arXiv.org\/abs\/2201.12086\u00a0[cs.CV]"},{"key":"e_1_3_3_1_17_2","unstructured":"Junnan Li Ramprasaath\u00a0R. Selvaraju Akhilesh\u00a0Deepak Gotmare Shafiq Joty Caiming Xiong and Steven Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. arxiv:https:\/\/arXiv.org\/abs\/2107.07651\u00a0[cs.CV]"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01303"},{"key":"e_1_3_3_1_19_2","unstructured":"Xiang\u00a0Lisa Li and Percy Liang. 2021. Prefix-Tuning: Optimizing Continuous Prompts for Generation. arxiv:https:\/\/arXiv.org\/abs\/2101.00190\u00a0[cs.CL]"},{"key":"e_1_3_3_1_20_2","unstructured":"Yin Li Qi Chen Kai Wang Meige Li Liping Si Yingwei Guo Yu Xiong Qixing Wang Yang Qin Ling Xu Patrick van\u00a0der Smagt Jun Tang and Nutan Chen. 2024. A dataset of primary nasopharyngeal carcinoma MRI with multi-modalities segmentation. arxiv:https:\/\/arXiv.org\/abs\/2404.03253\u00a0[eess.IV] https:\/\/arxiv.org\/abs\/2404.03253"},{"key":"e_1_3_3_1_21_2","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie Lubomir Bourdev Ross Girshick James Hays Pietro Perona Deva Ramanan C.\u00a0Lawrence Zitnick and Piotr Doll\u00e1r. 2015. Microsoft COCO: Common Objects in Context. arxiv:https:\/\/arXiv.org\/abs\/1405.0312\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1405.0312"},{"key":"e_1_3_3_1_22_2","unstructured":"Xi\u00a0Victoria Lin Todor Mihaylov Mikel Artetxe Tianlu Wang Shuohui Chen Daniel Simig Myle Ott Naman Goyal Shruti Bhosale Jingfei Du Ramakanth Pasunuru Sam Shleifer Punit\u00a0Singh Koura Vishrav Chaudhary Brian O\u2019Horo Jeff Wang Luke Zettlemoyer Zornitsa Kozareva Mona Diab Veselin Stoyanov and Xian Li. 2022. Few-shot Learning with Multilingual Language Models. arxiv:https:\/\/arXiv.org\/abs\/2112.10668\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2112.10668"},{"key":"e_1_3_3_1_23_2","unstructured":"Edward Loper and Steven Bird. 2002. NLTK: The Natural Language Toolkit. arxiv:https:\/\/arXiv.org\/abs\/cs\/0205028\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/cs\/0205028"},{"key":"e_1_3_3_1_24_2","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. Fixing Weight Decay Regularization in Adam."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096424"},{"key":"e_1_3_3_1_26_2","unstructured":"Ron Mokady Amir Hertz and Amit\u00a0H. Bermano. 2021. ClipCap: CLIP Prefix for Image Captioning. arxiv:https:\/\/arXiv.org\/abs\/2111.09734\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2111.09734"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_3_1_29_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv:https:\/\/arXiv.org\/abs\/2103.00020\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_3_1_30_2","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019)."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.eacl-main.266"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.104"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00278"},{"key":"e_1_3_3_1_34_2","unstructured":"Ilya Tolstikhin Neil Houlsby Alexander Kolesnikov Lucas Beyer Xiaohua Zhai Thomas Unterthiner Jessica Yung Andreas Steiner Daniel Keysers Jakob Uszkoreit Mario Lucic and Alexey Dosovitskiy. 2021. MLP-Mixer: An all-MLP Architecture for Vision. arxiv:https:\/\/arXiv.org\/abs\/2105.01601\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2105.01601"},{"key":"e_1_3_3_1_35_2","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141\u00a0ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems , I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.), Vol.\u00a030. Curran Associates, Inc., vaswani@google.com. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_3_1_36_2","unstructured":"Ramakrishna Vedantam C.\u00a0Lawrence Zitnick and Devi Parikh. 2015. CIDEr: Consensus-based Image Description Evaluation. arxiv:https:\/\/arXiv.org\/abs\/1411.5726\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1411.5726"},{"key":"e_1_3_3_1_37_2","unstructured":"Zirui Wang Jiahui Yu Adams\u00a0Wei Yu Zihang Dai Yulia Tsvetkov and Yuan Cao. 2022. SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. arxiv:https:\/\/arXiv.org\/abs\/2108.10904\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2108.10904"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Zifeng Wang Zizhao Zhang Sayna Ebrahimi Ruoxi Sun Han Zhang Chen-Yu Lee Xiaoqi Ren Guolong Su Vincent Perot Jennifer Dy and Tomas Pfister. 2022. DualPrompt: Complementary Prompting for Rehearsal-free Continual Learning. arxiv:https:\/\/arXiv.org\/abs\/2204.04799\u00a0[cs.LG]","DOI":"10.1007\/978-3-031-19809-0_36"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_3_1_40_2","unstructured":"Hantao Yao Rui Zhang and Changsheng Xu. 2023. Visual-Language Prompt Tuning with Knowledge-guided Context Optimization. arxiv:https:\/\/arXiv.org\/abs\/2303.13283\u00a0[cs.CV]"},{"key":"e_1_3_3_1_41_2","unstructured":"Zequn Zeng Yan Xie Hao Zhang Chiyu Chen Zhengjue Wang and Bo Chen. 2024. MeaCap: Memory-Augmented Zero-shot Image Captioning. arxiv:https:\/\/arXiv.org\/abs\/2403.03715\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2403.03715"},{"key":"e_1_3_3_1_42_2","unstructured":"Susan Zhang Stephen Roller Naman Goyal Mikel Artetxe Moya Chen Shuohui Chen Christopher Dewan Mona Diab Xian Li Xi\u00a0Victoria Lin Todor Mihaylov Myle Ott Sam Shleifer Kurt Shuster Daniel Simig Punit\u00a0Singh Koura Anjali Sridhar Tianlu Wang and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arxiv:https:\/\/arXiv.org\/abs\/2205.01068\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2205.01068"}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3770987","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:11:39Z","timestamp":1765008699000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3770987"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":41,"alternative-id":["10.1145\/3743093.3770987","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3770987","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}