{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T13:08:40Z","timestamp":1765544920898,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.62203314"],"award-info":[{"award-number":["No.62203314"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Sichuan Science and Technology Program","award":["2022YFG0261"],"award-info":[{"award-number":["2022YFG0261"]}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["No.2023SCU12003"],"award-info":[{"award-number":["No.2023SCU12003"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612326","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"5109-5119","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["CONICA: A Contrastive Image Captioning Framework with Robust Similarity Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0177-7252","authenticated-orcid":false,"given":"Lin","family":"Deng","sequence":"first","affiliation":[{"name":"Sichuan University, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8046-1904","authenticated-orcid":false,"given":"Yuzhong","family":"Zhong","sequence":"additional","affiliation":[{"name":"Sichuan University, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7076-9112","authenticated-orcid":false,"given":"Maoning","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Electrical Engineering, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5491-1745","authenticated-orcid":false,"given":"Jianwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sichuan University, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"SPICE: Semantic Propositional Image Caption Evaluation. In ECCV. 382--398.","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. SPICE: Semantic Propositional Image Caption Evaluation. In ECCV. 382--398."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In CVPR. 6077--6086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_3_1","volume-title":"METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In ACL. 228--231.","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In ACL. 228--231."},{"key":"e_1_3_2_1_4_1","volume-title":"The Unreasonable Effectiveness of CLIP Features for Image Captioning:An Experimental Analysis. In CVPR workshops. 4662--4670","author":"Barraco Manuele","year":"2022","unstructured":"Manuele Barraco, Marcella Cornia, Silvia Cascianelli, Lorenzo Baraldi, and Rita Cucchiara. 2022. The Unreasonable Effectiveness of CLIP Features for Image Captioning:An Experimental Analysis. In CVPR workshops. 4662--4670."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Ali Furkan Biten Lluis Gomez and Dimosthenis Karatzas. 2022. Let there be a clock on the beach: Reducing Object Hallucination in Image Captioning. In WACV. 1381--1390.","DOI":"10.1109\/WACV51458.2022.00253"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Zhe Cao Tao Qin Tie-Yan Liu Ming-Feng Tsai and Hang Li. 2007. Learning to Rank: From Pairwise Approach to Listwise Approach.","DOI":"10.1145\/1273496.1273513"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Marcella Cornia Matteo Stefanini Lorenzo Baraldi and Rita Cucchiara. 2020. Meshed-Memory Transformer for Image Captioning. In CVPR. 10578--10587.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_1_8_1","unstructured":"Bo Dai and Dahua Lin. 2017. Contrastive Learning for Image Captioning. In NeurIPS."},{"key":"e_1_3_2_1_9_1","unstructured":"Kaiming He Haoqi Fan Yuxin Wu Saining Xie and Ross Girshick. 2020. Momentum Contrast for Unsupervised Visual Representation Learning. In CVPR. 9730--9738."},{"key":"e_1_3_2_1_10_1","volume-title":"Image Captioning: Transforming Objects into Words. In NeurIPS.","author":"Herdade Simao","year":"2019","unstructured":"Simao Herdade, Armin Kappeler, Kofi Boakye, and Joao Soares. 2019. Image Captioning: Transforming Objects into Words. In NeurIPS."},{"key":"e_1_3_2_1_11_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. CLIPScore: A Reference-free Evaluation Metric for Image Captioning. In EMNLP. 7514--7528."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Lun Huang Wenmin Wang Jie Chen and Xiao-Yong Wei. 2018. Attention on Attention for Image Captioning. In ICCV. 4634--4643.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Wenhao Jiang Lin Ma Yu-Gang Jiang Wei Liu and Tong Zhang. 2018. Recurrent Fusion Network for Image Captioning. In ECCV. 499--515.","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Andrej Karpathy and Fei-Fei Li. 2015. Deep Visual-Semantic Alignments for Generating Image Descriptions. In CVPR. 3128--3137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_15_1","volume-title":"HAAV: Hierarchical Aggregation of Augmented Views for Image Captioning. In CVPR. 11039--11049.","author":"Kuo Chia-Wen","year":"2023","unstructured":"Chia-Wen Kuo and Zsolt Kira. 2023. HAAV: Hierarchical Aggregation of Augmented Views for Image Captioning. In CVPR. 11039--11049."},{"key":"e_1_3_2_1_16_1","volume-title":"UMIC: An Unreferenced Metric for Image Captioning via Contrastive Learning. In ACL(short papers). 220--226.","author":"Lee Hwanhee","year":"2021","unstructured":"Hwanhee Lee, Seunghyun Yoon, Franck Dernoncourt, Trung Bui, and Kyomin Jung. 2021. UMIC: An Unreferenced Metric for Image Captioning via Contrastive Learning. In ACL(short papers). 220--226."},{"key":"e_1_3_2_1_17_1","volume-title":"UNIMO: Towards Unified-Modal Understanding and Generation via Cross-Modal Contrastive Learning. In ACL. 2592--2607.","author":"Li Wei","year":"2021","unstructured":"Wei Li, Can Gao, Guocheng Niu, Xinyan Xiao, Hao Liu, Jiachen Liu, Hua Wu, and Haifeng Wang. 2021. UNIMO: Towards Unified-Modal Understanding and Generation via Cross-Modal Contrastive Learning. In ACL. 2592--2607."},{"key":"e_1_3_2_1_18_1","volume-title":"Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In ECCV. 121--137.","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, Yejin Choi, and Jianfeng Gao. 2020. Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In ECCV. 121--137."},{"key":"e_1_3_2_1_19_1","unstructured":"Yehao Li Yingwei Pan Ting Yao and Tao Mei. 2022. Comprehending and Ordering Semantics for Image Captioning. In CVPR. 2585--2594."},{"key":"e_1_3_2_1_20_1","volume-title":"ACL Workshops.","author":"Lin Chin-Yew","year":"2003","unstructured":"Chin-Yew Lin. 2003. Rouge: A package for automatic evaluation of summaries. In ACL Workshops."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Tsung-Yi Lin Serge Belongie James Hays Pietro Perona Deva Rmanan Piotr Doll\u00e1r and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV. 740--755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Siqi Liu Zhenhai Zhu Ning Ye Sergio Guadarrama and Kevin Murphy. 2017. Improved Image Captioning via Policy Gradient optimization of SPIDEr. In ICCV. 873--881.","DOI":"10.1109\/ICCV.2017.100"},{"key":"e_1_3_2_1_23_1","volume-title":"CPTR: Full Transformer Network for Image Captioning. In arXiv preprint:2101.10804.","author":"Liu Wei","year":"2021","unstructured":"Wei Liu, Sihan Chen, Longteng Guo, Xinxin Zhu, and Jing Liu. 2021. CPTR: Full Transformer Network for Image Captioning. In arXiv preprint:2101.10804."},{"key":"e_1_3_2_1_24_1","unstructured":"Ruotian Luo. 2020. A Better Variant of Self-Critical Sequence Training. In arXiv preprint:2003.09971."},{"key":"e_1_3_2_1_25_1","volume-title":"Bermano","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H. Bermano. 2021. ClipCap: CLIP Prefix for Image Captioning. In arXiv preprint:2111.09734."},{"key":"e_1_3_2_1_26_1","volume-title":"GRIT: Faster and Better Image captioning Transformer Using Dual Visual Features. In ECCV. 167--184.","author":"Nguyen Van-Quang","year":"2022","unstructured":"Van-Quang Nguyen, Masanori Suganuma, and Takayuki Okatani. 2022. GRIT: Faster and Better Image captioning Transformer Using Dual Visual Features. In ECCV. 167--184."},{"key":"e_1_3_2_1_27_1","unstructured":"Yingwei Pan Ting Yao Yehao Li and Tao Mei. 2020. X-Linear Attention Networks for Image Captioning. In CVPR. 10971--10980."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and Wei-Jing Zhu. 2002. BLEU: a method for automatic evaluation of machine translation. In ACL. 311--318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_29_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML."},{"key":"e_1_3_2_1_30_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In NeurIPS."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Steven J. Rennie Etienne Marcheret Youssef Mroueh Jarret Ross and Vaibhava Goel. 2017. Self-critical Sequence Training for Image Captioning. In CVPR. 7008--7024.","DOI":"10.1109\/CVPR.2017.131"},{"key":"e_1_3_2_1_32_1","volume-title":"Kaylee Burns, Trevor Darrell, and Kate Saenko.","author":"Rohrbach Anna","year":"2018","unstructured":"Anna Rohrbach, Lisa Anne Hendricks, Kaylee Burns, Trevor Darrell, and Kate Saenko. 2018. Object Hallucination in Image Captioning., 4035--4045 pages."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Sara Sarto Manuele Barraco Marcella Cornia Lorenzo Baraldi and Rita Cucchiara. 2023. Positive-Augmented Contrastive Learning for Image and Video Captioning Evaluation. In CVPR. 6914--6924.","DOI":"10.1109\/CVPR52729.2023.00668"},{"key":"e_1_3_2_1_34_1","volume-title":"Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL. 2556--2565.","author":"Sharma Piyush","year":"2018","unstructured":"Piyush Sharma, Nan Ding, Sebastian Goodman, and Radu Soricut. 2018. Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL. 2556--2565."},{"key":"e_1_3_2_1_35_1","volume-title":"Hao Tan, Mohit Bansal, Anna Rohrbach, Kai-Wei Chang, Zhewei Yao, and Kurt Keutzer.","author":"Shen Sheng","year":"2022","unstructured":"Sheng Shen, Linen Harold Li, Hao Tan, Mohit Bansal, Anna Rohrbach, Kai-Wei Chang, Zhewei Yao, and Kurt Keutzer. 2022. How Much Can CLIP Benefit Visionand-Language Tasks. In ICLR."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Ruixiang Tang Mengnan Du Yuening Li Zirui Liu Na Zou and Xia Hu. 2021. Mitigating Gender Bias in Captioning Systems. In WWW. 633--645.","DOI":"10.1145\/3442381.3449950"},{"key":"e_1_3_2_1_37_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Ramakrishna Vedantam C. Lawrence Zitnick and Devi Parikh. 2015. CIDEr: Consensus-based Image Description Evaluation. In CVPR. 4566--4575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"O. Vinyals A. Toshev S. Bengio and D. Erhan. 2015. Show and tell: A neural image caption generator. In CVPR. 3156--3164.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_40_1","volume-title":"OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework. In ICML.","author":"Wang Peng","year":"2023","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2023. OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework. In ICML."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Yiyu Wang Jungang Xu and Yingfei Sun. 2022. End-to-End Transformer Based Model for Image Captioning. In AAAI.","DOI":"10.1609\/aaai.v36i3.20160"},{"key":"e_1_3_2_1_42_1","unstructured":"Kelvin Xu Jimmy Ba Ryan Kiros Kyunghyun Cho Aaron Courville Ruslan Salakhudinov Rich Zemel and Yoshua Bengio. 2015. Show Attend and Tell: Neural Image Caption Generation with Visual Attention. In ICML. 2048--2057."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Xu Yang Chongyang Gao Hanwang Zhang and Jianfei Cai. 2021. Auto-parsing network for image captioning and visual question answering. In ICCV. 2197--2207.","DOI":"10.1109\/ICCV48922.2021.00220"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Xu Yang Kaihua Tang Hanwang Zhang and Jianfei Cai. 2019. Auto-encoding scene graphs for image captioning. In CVPR. 10685--10694.","DOI":"10.1109\/CVPR.2019.01094"},{"key":"e_1_3_2_1_45_1","unstructured":"Peter Young Alice Lai Micah Hodosh and Julia Hockenmaier. [n.d.]. ([n. d.])."},{"key":"e_1_3_2_1_46_1","volume-title":"CoCa: Contrastive Captioners are Image-Text Foundation Models. Transactions on Machine Learning Research","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, ZiruiWang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. CoCa: Contrastive Captioners are Image-Text Foundation Models. Transactions on Machine Learning Research (2022)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Pengpeng Zeng Jinkuan Zhu Jingkuan Song and Lianli Gao. 2022. Progressive Tree-Structured Prototype Network. In ACM MM. 5210--5218.","DOI":"10.1145\/3503161.3548024"},{"key":"e_1_3_2_1_48_1","volume-title":"Yejin Choi, and Jianfeng Gao.","author":"Zhang Pengchuan","year":"2021","unstructured":"Pengchuan Zhang, Xiujun Li, Xiaowei Hu, Jianwei Yang, LijuanWang Lei Zhang, Yejin Choi, and Jianfeng Gao. 2021. VinVL: Revisiting Visual Representations in Vision-Language Models. In CVPR. 5579--5588."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Xuying Zhang Xiaoshuai Sun Yunpeng Luo Jiayi Ji Yiyi Zhou Yongjian Wu Feiyue Huang and Rongrong Ji. 2021. RSTNet: Captioning with Adaptive Attention on Visual and Non-Visual Words. In CVPR. 15465--15474.","DOI":"10.1109\/CVPR46437.2021.01521"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612326","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612326","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:57:15Z","timestamp":1755820635000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612326"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":49,"alternative-id":["10.1145\/3581783.3612326","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612326","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}