{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T18:41:42Z","timestamp":1762368102489,"version":"build-2065373602"},"reference-count":23,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,8,31]],"date-time":"2025-08-31T00:00:00Z","timestamp":1756598400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,8,31]],"date-time":"2025-08-31T00:00:00Z","timestamp":1756598400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,8,31]]},"DOI":"10.1109\/mlsp62443.2025.11204242","type":"proceedings-article","created":{"date-parts":[[2025,10,24]],"date-time":"2025-10-24T17:15:52Z","timestamp":1761326152000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["Closing the Gap in Multimodal Medical Representation Alignment"],"prefix":"10.1109","author":[{"given":"Eleonora","family":"Grassucci","sequence":"first","affiliation":[{"name":"Sapienza University of Rome,Dept. of Information Engineering, Electronics and Telecommunications,Italy"}]},{"given":"Giordano","family":"Cicchetti","sequence":"additional","affiliation":[{"name":"Sapienza University of Rome,Dept. of Information Engineering, Electronics and Telecommunications,Italy"}]},{"given":"Danilo","family":"Comminiello","sequence":"additional","affiliation":[{"name":"Sapienza University of Rome,Dept. of Information Engineering, Electronics and Telecommunications,Italy"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.256"},{"key":"ref2","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning (ICML)","author":"Radford","year":"2021"},{"key":"ref3","article-title":"Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning","author":"Liang","year":"2022","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref4","article-title":"Mitigate the gap: Investigating approaches for improving cross-modal alignment in clip","author":"Eslami","year":"2024","journal-title":"ArXiv preprint"},{"key":"ref5","article-title":"It\u2019s not a modality gap: Characterizing and addressing the contrastive gap","author":"Fahim","year":"2024","journal-title":"ArXiv preprint"},{"key":"ref6","article-title":"Lungren, Tristan Naumann, Sheng Wang, and Hoifung Poon, \u201cBiomedCLIP: a multimodal biomedical foundation model pretrained from fifteen million scientific image-text pairs","author":"Zhang","year":"2023","journal-title":"ArXiv preprint"},{"key":"ref7","article-title":"Towards a clinically accessible radiology foundation model: open-access and lightweight, with automated evaluation","author":"Manuel","year":"2024","journal-title":"ArXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72661-3_27"},{"key":"ref9","article-title":"Explaining and mitigating the modality gap in contrastive multimodal learning","author":"Yaras","year":"2024","journal-title":"ArXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref13","article-title":"LanguageBind: Extending video-language pretraining to nmodality by language-based semantic alignment","volume-title":"International Conference on Learning Representations (ICLR)","author":"Zhu","year":"2024"},{"key":"ref14","article-title":"VAST: A vision-audio-subtitletext omni-modality foundation model and dataset","author":"Chen","year":"2023","journal-title":"Neural Information Processing Systems (NeurIPS)"},{"key":"ref15","article-title":"Gramian multimodal representation learning and alignment","volume-title":"International Conference on Learning Representations (ICLR)","author":"Cicchetti","year":"2025"},{"key":"ref16","article-title":"Contrasting with symile: Simple modelagnostic representation learning for unlimited modalities","author":"Saporta","year":"2024","journal-title":"Neural Information Processing Systems"},{"key":"ref17","article-title":"Understanding contrastive learning via distributionally robust optimization","author":"Wu","year":"2023","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref18","article-title":"Towards understanding the modality gap in CLIP","volume-title":"ICLR 2023 Workshop on Multimodal Representation Learning: Perks and Pitfalls","author":"Shi","year":"2023"},{"key":"ref19","article-title":"Two effects, one trigger: On the modality gap, object bias, and information imbalance in contrastive vision-language models","volume-title":"International Conference on Learning Representations (ICLR)","author":"Schrodi","year":"2024"},{"key":"ref20","first-page":"9929","article-title":"Understanding contrastive representation learning through alignment and uniformity on the hypersphere","volume-title":"International Conference on Machine Learning. PMLR","author":"Wang","year":"2020"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01364-6_20"},{"key":"ref22","article-title":"EVA-CLIP: Improved training techniques for clip at scale","author":"Sun","year":"2023","journal-title":"ArXiv preprint"},{"volume-title":"VideoCoCa: Video-text modeling with zero-shot transfer from contrastive captioners","year":"2022","author":"Yan","key":"ref23"}],"event":{"name":"2025 IEEE 35th International Workshop on Machine Learning for Signal Processing (MLSP)","start":{"date-parts":[[2025,8,31]]},"location":"Istanbul, Turkiye","end":{"date-parts":[[2025,9,3]]}},"container-title":["2025 IEEE 35th International Workshop on Machine Learning for Signal Processing (MLSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11204201\/11204202\/11204242.pdf?arnumber=11204242","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T18:37:19Z","timestamp":1762367839000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11204242\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,31]]},"references-count":23,"URL":"https:\/\/doi.org\/10.1109\/mlsp62443.2025.11204242","relation":{},"subject":[],"published":{"date-parts":[[2025,8,31]]}}}