{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:11:22Z","timestamp":1765505482176,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","funder":[{"name":"National Research Foundation of Korea (NRF)","award":["RS-2024-00414981"],"award-info":[{"award-number":["RS-2024-00414981"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3760959","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T00:36:36Z","timestamp":1762562196000},"page":"5268-5272","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Sparse and Dense Retrievers Learn Better Together: Joint Sparse-Dense Optimization for Text-Image Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-5812-4746","authenticated-orcid":false,"given":"Jonghyun","family":"Song","sequence":"first","affiliation":[{"name":"Graduate School of Data Science, Seoul National University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1997-4135","authenticated-orcid":false,"given":"Youngjune","family":"Lee","sequence":"additional","affiliation":[{"name":"NAVER Corporation, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0944-209X","authenticated-orcid":false,"given":"Gyu-Hwung","family":"Cho","sequence":"additional","affiliation":[{"name":"NAVER Corporation, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2120-313X","authenticated-orcid":false,"given":"Ilhyeon","family":"Song","sequence":"additional","affiliation":[{"name":"NAVER Corporation, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7250-2654","authenticated-orcid":false,"given":"Saehun","family":"Kim","sequence":"additional","affiliation":[{"name":"NAVER Corporation, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9296-3403","authenticated-orcid":false,"given":"Yohan","family":"Jo","sequence":"additional","affiliation":[{"name":"Graduate School of Data Science, Seoul National University, Seoul, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00512"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657769"},{"volume-title":"Handbook of computational statistics: Concepts and methods","author":"B\u00fchlmann Peter","key":"e_1_3_2_1_3_1","unstructured":"Peter B\u00fchlmann. 2011. Bagging, boosting and ensemble methods. In Handbook of computational statistics: Concepts and methods. Springer, 985-1022."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.932"},{"key":"e_1_3_2_1_5_1","volume-title":"Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. arXiv preprint arXiv:2402.03216","author":"Chen Jianlv","year":"2024","unstructured":"Jianlv Chen, Shitao Xiao, Peitian Zhang, Kun Luo, Defu Lian, and Zheng Liu. 2024. Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. arXiv preprint arXiv:2402.03216 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Context-aware sentence\/passage term importance estimation for first stage retrieval. arXiv preprint arXiv:1910.10687","author":"Dai Zhuyun","year":"2019","unstructured":"Zhuyun Dai and Jamie Callan. 2019. Context-aware sentence\/passage term importance estimation for first stage retrieval. arXiv preprint arXiv:1910.10687 (2019)."},{"key":"e_1_3_2_1_7_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_8_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_9_1","volume-title":"SPLADE v2: Sparse lexical and expansion model for information retrieval. arXiv preprint arXiv:2109.10086","author":"Formal Thibault","year":"2021","unstructured":"Thibault Formal, Carlos Lassance, Benjamin Piwowarski, and St\u00e9phane Clinchant. 2021a. SPLADE v2: Sparse lexical and expansion model for information retrieval. arXiv preprint arXiv:2109.10086 (2021)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463098"},{"key":"e_1_3_2_1_11_1","volume-title":"COIL: Revisit exact lexical match in information retrieval with contextualized inverted list. arXiv preprint arXiv:2104.07186","author":"Gao Luyu","year":"2021","unstructured":"Luyu Gao, Zhuyun Dai, and Jamie Callan. 2021. COIL: Revisit exact lexical match in information retrieval with contextualized inverted list. arXiv preprint arXiv:2104.07186 (2021)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 17980-17989","author":"Hu Xiaowei","year":"2022","unstructured":"Xiaowei Hu, Zhe Gan, Jianfeng Wang, Zhengyuan Yang, Zicheng Liu, Yumao Lu, and Lijuan Wang. 2022. Scaling up vision-language pre-training for image captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 17980-17989."},{"key":"e_1_3_2_1_13_1","volume-title":"2025 IEEE International Students' Conference on Electrical, Electronics and Computer Science (SCEECS). IEEE, 1-6.","author":"Jeshmol PJ","year":"2025","unstructured":"PJ Jeshmol and Binsu C Kovoor. 2025. A CLIP-based Video Question Answering framework with Explainable AI. In 2025 IEEE International Students' Conference on Electrical, Electronics and Computer Science (SCEECS). IEEE, 1-6."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_16_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_17_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694-9705."},{"key":"e_1_3_2_1_18_1","first-page":"1141","article-title":"Unified lexical representation for interpretable visual-language alignment","volume":"37","author":"Li Yifan","year":"2024","unstructured":"Yifan Li, Yikai Wang, Yanwei Fu, Dongyu Ru, Zheng Zhang, and Tong He. 2024. Unified lexical representation for interpretable visual-language alignment. Advances in Neural Information Processing Systems, Vol. 37 (2024), 1141-1161.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","volume-title":"A few brief notes on deepimpact, coil, and a conceptual framework for information retrieval techniques. arXiv preprint arXiv:2106.14807","author":"Lin Jimmy","year":"2021","unstructured":"Jimmy Lin and Xueguang Ma. 2021. A few brief notes on deepimpact, coil, and a conceptual framework for information retrieval techniques. arXiv preprint arXiv:2106.14807 (2021)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_21_1","volume-title":"VisualSparta: An embarrassingly simple approach to large-scale text-to-image search with weighted bag-of-words. arXiv preprint arXiv:2101.00265","author":"Lu Xiaopeng","year":"2021","unstructured":"Xiaopeng Lu, Tiancheng Zhao, and Kyusong Lee. 2021. VisualSparta: An embarrassingly simple approach to large-scale text-to-image search with weighted bag-of-words. arXiv preprint arXiv:2101.00265 (2021)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01029"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01752"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463030"},{"key":"e_1_3_2_1_25_1","volume-title":"Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-56060-6_29"},{"key":"e_1_3_2_1_27_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2009916.2009992"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_31_1","volume-title":"Clip models are few-shot learners: Empirical studies on vqa and visual entailment. arXiv preprint arXiv:2203.07190","author":"Song Haoyu","year":"2022","unstructured":"Haoyu Song, Li Dong, Wei-Nan Zhang, Ting Liu, and Furu Wei. 2022. Clip models are few-shot learners: Empirical studies on vqa and visual entailment. arXiv preprint arXiv:2203.07190 (2022)."},{"key":"e_1_3_2_1_32_1","volume-title":"Cross-modal retrieval: a systematic review of methods and future directions. Proc","author":"Wang Tianshi","year":"2025","unstructured":"Tianshi Wang, Fengling Li, Lei Zhu, Jingjing Li, Zheng Zhang, and Heng Tao Shen. 2025. Cross-modal retrieval: a systematic review of methods and future directions. Proc. IEEE (2025)."},{"key":"e_1_3_2_1_33_1","volume-title":"Towards Visual Grounding: A Survey. arXiv preprint arXiv:2412.20206","author":"Xiao Linhui","year":"2024","unstructured":"Linhui Xiao, Xiaoshan Yang, Xiangyuan Lan, Yaowei Wang, and Changsheng Xu. 2024. Towards Visual Grounding: A Survey. arXiv preprint arXiv:2412.20206 (2024)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3321501"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3348297"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543873.3584627"}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Seoul Republic of Korea","acronym":"CIKM '25"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3760959","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:06:49Z","timestamp":1765505209000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3760959"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":36,"alternative-id":["10.1145\/3746252.3760959","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3760959","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}