{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:54:28Z","timestamp":1781538868614,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["No. 62576085"],"award-info":[{"award-number":["No. 62576085"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810643","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"223-232","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CCRA: A Cross-modal Complementary Representation Alignment Framework for Bridging the Modality Gap"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2005-6899","authenticated-orcid":false,"given":"Xingchen","family":"Han","sequence":"first","affiliation":[{"name":"Northeastern University, shenyang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3180-0218","authenticated-orcid":false,"given":"Ruihao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Northeastern University, shenyang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4239-3880","authenticated-orcid":false,"given":"Ruiting","family":"Li","sequence":"additional","affiliation":[{"name":"Northeastern University, shenyang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0913-8367","authenticated-orcid":false,"given":"Yingxin","family":"Pei","sequence":"additional","affiliation":[{"name":"Northeastern University, shenyang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7389-8396","authenticated-orcid":false,"given":"Jiaqi","family":"Wang","sequence":"additional","affiliation":[{"name":"Northeastern University, shenyang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9610-5790","authenticated-orcid":false,"given":"Zhe","family":"Ji","sequence":"additional","affiliation":[{"name":"Northeastern University, shenyang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6824-1191","authenticated-orcid":false,"given":"Feiliang","family":"Ren","sequence":"additional","affiliation":[{"name":"Northeastern University, shenyang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3098-0225","authenticated-orcid":false,"given":"Yongkang","family":"Liu","sequence":"additional","affiliation":[{"name":"Northeastern University, qinhuangdao, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01449"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"e_1_3_3_1_4_2","first-page":"933","volume-title":"International conference on machine learning","author":"Dauphin Yann\u00a0N","year":"2017","unstructured":"Yann\u00a0N Dauphin, Angela Fan, Michael Auli, and David Grangier. 2017. Language modeling with gated convolutional networks. In International conference on machine learning. PMLR, 933\u2013941."},{"key":"e_1_3_3_1_5_2","unstructured":"Yifan Du Zikang Liu Junyi Li and Wayne\u00a0Xin Zhao. 2022. A survey of vision-language pre-trained models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2202.10936 (2022)."},{"key":"e_1_3_3_1_6_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Eslami Sedigheh","year":"2025","unstructured":"Sedigheh Eslami and Gerard de Melo. 2025. Mitigate the gap: Improving cross-modal alignment in CLIP. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_3_1_7_2","unstructured":"Fartash Faghri David\u00a0J Fleet Jamie\u00a0Ryan Kiros and Sanja Fidler. 2017. Vse++: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1707.05612 (2017)."},{"key":"e_1_3_3_1_8_2","unstructured":"Abrar Fahim Alex Murphy and Alona Fyshe. 2024. It\u2019s Not a Modality Gap: Characterizing and Addressing the Contrastive Gap. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.18570 (2024)."},{"key":"e_1_3_3_1_9_2","unstructured":"Alex Fang Albin\u00a0Madappally Jose Amit Jain Ludwig Schmidt Alexander Toshev and Vaishaal Shankar. 2023. Data filtering networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.17425 (2023)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01067"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Shashank Goel Hritik Bansal Sumit Bhatia Ryan Rossi Vishwa Vinay and Aditya Grover. 2022. Cyclip: Cyclic contrastive language-image pretraining. Advances in Neural Information Processing Systems 35 (2022) 6704\u20136719.","DOI":"10.52202\/068431-0486"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02519"},{"key":"e_1_3_3_1_13_2","unstructured":"Ahmet Iscen Mathilde Caron Alireza Fathi and Cordelia Schmid. 2023. Retrieval-enhanced contrastive vision-text models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.07196 (2023)."},{"key":"e_1_3_3_1_14_2","first-page":"4904","volume-title":"International conference on machine learning","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904\u20134916."},{"key":"e_1_3_3_1_15_2","unstructured":"Xu Jia Bert De\u00a0Brabandere Tinne Tuytelaars and Luc\u00a0V Gool. 2016. Dynamic filter networks. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00740"},{"key":"e_1_3_3_1_17_2","unstructured":"Zaid Khan and Yun Fu. 2023. Contrastive alignment of vision to language through parameter-efficient transfer learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.11866 (2023)."},{"key":"e_1_3_3_1_18_2","first-page":"5583","volume-title":"International conference on machine learning","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning. PMLR, 5583\u20135594."},{"key":"e_1_3_3_1_19_2","unstructured":"Ryan Kiros Ruslan Salakhutdinov and Richard\u00a0S Zemel. 2014. Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1411.2539 (2014)."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Janghyeon Lee Jongsuk Kim Hyounguk Shon Bumsoo Kim Seung\u00a0Hwan Kim Honglak Lee and Junmo Kim. 2022. Uniclip: Unified framework for contrastive language-image pre-training. Advances in Neural Information Processing Systems 35 (2022) 1008\u20131019.","DOI":"10.52202\/068431-0074"},{"key":"e_1_3_3_1_21_2","unstructured":"Meir\u00a0Yossef Levi and Guy Gilboa. 2024. The double-ellipsoid geometry of clip. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.14517 (2024)."},{"key":"e_1_3_3_1_22_2","first-page":"12888","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_1_23_2","unstructured":"Junnan Li Ramprasaath Selvaraju Akhilesh Gotmare Shafiq Joty Caiming Xiong and Steven Chu\u00a0Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34 (2021) 9694\u20139705."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Victor\u00a0Weixin Liang Yuhui Zhang Yongchan Kwon Serena Yeung and James\u00a0Y Zou. 2022. Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning. Advances in Neural Information Processing Systems 35 (2022) 17612\u201317625.","DOI":"10.52202\/068431-1280"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems 36 (2023) 34892\u201334916.","DOI":"10.52202\/075280-1516"},{"key":"e_1_3_3_1_28_2","unstructured":"Siqu Long Feiqi Cao Soyeon\u00a0Caren Han and Haiqin Yang. 2022. Vision-and-language pretrained models: A survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.07356 (2022)."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Andrzej Ma\u0107kiewicz and Waldemar Ratajczak. 1993. Principal components analysis (PCA). Computers & Geosciences 19 3 (1993) 303\u2013342.","DOI":"10.1016\/0098-3004(93)90090-R"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Changdae Oh Junhyuk So Hoyoon Byun YongTaek Lim Minchul Shin Jong-June Jeon and Kyungwoo Song. 2023. Geodesic multi-modal mixup for robust fine-tuning. Advances in Neural Information Processing Systems 36 (2023) 52326\u201352341.","DOI":"10.52202\/075280-2278"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01934"},{"key":"e_1_3_3_1_32_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_33_2","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 1 2 (2022) 3."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.5555\/1866696.1866717"},{"key":"e_1_3_3_1_35_2","unstructured":"Fran\u00e7ois Role S\u00e9bastien Meyer and Victor Amblard. 2025. Fill the Gap: Quantifying and Reducing the Modality Gap in Image-Text Representation Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.03703 (2025)."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_1_37_2","unstructured":"William Rudman Nate Gillman Taylor Rayne and Carsten Eickhoff. 2021. IsoScore: Measuring the uniformity of embedding space utilization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.07344 (2021)."},{"key":"e_1_3_3_1_38_2","unstructured":"Noam Shazeer. 2020. Glu variants improve transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2002.05202 (2020)."},{"key":"e_1_3_3_1_39_2","unstructured":"PY Shi M Welle M Bj\u00f8rkman and D Kragic. 2023. Understanding the Modality Gap in Clip. ICLR Stockholm Sweden (2023)."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Gaurav Shinde Anuradha Ravi Emon Dey Shadman Sakib Milind Rampure and Nirmalya Roy. 2025. A Survey on Efficient Vision-Language Models. Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery 15 3 (2025) e70036.","DOI":"10.1002\/widm.70036"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2025-1664"},{"key":"e_1_3_3_1_42_2","unstructured":"Michael Tschannen Alexey Gritsenko Xiao Wang Muhammad\u00a0Ferjad Naeem Ibrahim Alabdulmohsin Nikhil Parthasarathy Talfan Evans Lucas Beyer Ye Xia Basil Mustafa et\u00a0al. 2025. Siglip 2: Multilingual vision-language encoders with improved semantic understanding localization and dense features. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.14786 (2025)."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Kirill Tyshchuk Polina Karpikova Andrew Spiridonov Anastasiia Prutianova Anton Razzhigaev and Alexander Panchenko. 2023. On isotropy of multimodal embeddings. Information 14 7 (2023) 392.","DOI":"10.3390\/info14070392"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539253"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Fuying Wang Yuyin Zhou Shujun Wang Varut Vardhanabhuti and Lequan Yu. 2022. Multi-granularity cross-modal alignment for generalized medical visual representation learning. Advances in neural information processing systems 35 (2022) 33536\u201333549.","DOI":"10.52202\/068431-2430"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01381"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Peng Wang Dagang Li Xuesi Hu Yongmei Wang and Youhua Zhang. 2025. CLIPMulti: Explore the performance of multimodal enhanced CLIP for zero-shot text classification. Computer Speech & Language 90 (2025) 101748.","DOI":"10.1016\/j.csl.2024.101748"},{"key":"e_1_3_3_1_48_2","first-page":"9929","volume-title":"International conference on machine learning","author":"Wang Tongzhou","year":"2020","unstructured":"Tongzhou Wang and Phillip Isola. 2020. Understanding contrastive representation learning through alignment and uniformity on the hypersphere. In International conference on machine learning. PMLR, 9929\u20139939."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570481"},{"key":"e_1_3_3_1_50_2","unstructured":"Zhengbo Wang Jian Liang Ran He Nan Xu Zilei Wang and Tieniu Tan. 2023. Improving zero-shot generalization for clip with synthesized prompts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.07397 (2023)."},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02206"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00677"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00402"},{"key":"e_1_3_3_1_54_2","unstructured":"Can Yaras Siyi Chen Peng Wang and Qing Qu. 2024. Explaining and mitigating the modality gap in contrastive multimodal learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.07909 (2024)."},{"key":"e_1_3_3_1_55_2","unstructured":"Lingjie Yi Raphael Douady and Chao Chen. 2025. Decipher the Modality Gap in Multimodal Contrastive Learning: From Convergent Representations to Pairwise Alignment. arxiv:https:\/\/arXiv.org\/abs\/2510.03268\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2510.03268"},{"key":"e_1_3_3_1_56_2","unstructured":"Lingjie Yi Raphael Douady and Chao Chen. 2025. Decrypt Modality Gap in Multimodal Contrastive Learning: From Convergent Representation to Pair Alignment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2510.03268 (2025)."},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_5"},{"key":"e_1_3_3_1_58_2","unstructured":"Jiahui Yu Zirui Wang Vijay Vasudevan Legg Yeung Mojtaba Seyedhosseini and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.01917 (2022)."},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1073"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"crossref","unstructured":"Jingyi Zhang Jiaxing Huang Sheng Jin and Shijian Lu. 2024. Vision-language models for vision tasks: A survey. IEEE transactions on pattern analysis and machine intelligence 46 8 (2024) 5625\u20135644.","DOI":"10.1109\/TPAMI.2024.3369699"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:00:58Z","timestamp":1781535658000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810643"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":59,"alternative-id":["10.1145\/3805622.3810643","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810643","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}