{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T17:34:53Z","timestamp":1772213693352,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100012542","name":"Sichuan Province Science and Technology Support Program","doi-asserted-by":"publisher","award":["2023YFG0025"],"award-info":[{"award-number":["2023YFG0025"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100012542","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62371325, 62071314"],"award-info":[{"award-number":["62371325, 62071314"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681531","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"4861-4870","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Towards Medical Vision-Language Contrastive Pre-training via Study-Oriented Semantic Exploration"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2165-245X","authenticated-orcid":false,"given":"Bo","family":"Liu","sequence":"first","affiliation":[{"name":"School of Computer Science, Sichuan University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0972-7044","authenticated-orcid":false,"given":"Zexin","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Sichuan University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8843-8685","authenticated-orcid":false,"given":"Yan","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Sichuan University, Chengdu, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Publicly available clinical BERT embeddings. arXiv preprint arXiv:1904.03323","author":"Alsentzer Emily","year":"2019","unstructured":"Emily Alsentzer, John R Murphy, Willie Boag, Wei-Hung Weng, Di Jin, Tristan Naumann, and Matthew McDermott. 2019. Publicly available clinical BERT embeddings. arXiv preprint arXiv:1904.03323 (2019)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Benedikt Boecking Naoto Usuyama Shruthi Bannur Daniel C Castro Anton Schwaighofer Stephanie Hyland Maria Wetscherek Tristan Naumann Aditya Nori Javier Alvarez-Valle et al. 2022. Making the most of text semantics to improve biomedical vision--language processing. arXiv preprint arXiv:2204.09817 (2022).","DOI":"10.1007\/978-3-031-20059-5_1"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"e_1_3_2_1_4_1","first-page":"9912","article-title":"Unsupervised learning of visual features by contrasting cluster assignments","volume":"33","author":"Caron Mathilde","year":"2020","unstructured":"Mathilde Caron, Ishan Misra, Julien Mairal, Priya Goyal, Piotr Bojanowski, and Armand Joulin. 2020. Unsupervised learning of visual features by contrasting cluster assignments. Advances in Neural Information Processing Systems 33 (2020), 9912--9924.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-59713-9_51"},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Machine Learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International Conference on Machine Learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_7_1","volume-title":"Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297","author":"Chen Xinlei","year":"2020","unstructured":"Xinlei Chen, Haoqi Fan, Ross Girshick, and Kaiming He. 2020. Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 (2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02139"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2023.103018"},{"key":"e_1_3_2_1_10_1","volume-title":"MCAD: Multi-modal Conditioned Adversarial Diffusion Model for High- Quality PET Image Reconstruction. arXiv preprint arXiv:2406.13150","author":"Cui Jiaqi","year":"2024","unstructured":"Jiaqi Cui, Xinyi Zeng, Pinxian Zeng, Bo Liu, Xi Wu, Jiliu Zhou, and Yan Wang. 2024. MCAD: Multi-modal Conditioned Adversarial Diffusion Model for High- Quality PET Image Reconstruction. arXiv preprint arXiv:2406.13150 (2024)."},{"key":"e_1_3_2_1_11_1","first-page":"2292","article-title":"Sinkhorn distances: Lightspeed computation of optimal transport","volume":"26","author":"Cuturi Marco","year":"2013","unstructured":"Marco Cuturi. 2013. Sinkhorn distances: Lightspeed computation of optimal transport. Advances in Neural Information Processing Systems 26 (2013), 2292-- 2300.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. VSE: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_13_1","volume-title":"CERT: Contrastive self-supervised learning for language understanding. arXiv preprint arXiv:2005.12766","author":"Fang Hongchao","year":"2020","unstructured":"Hongchao Fang, Sicheng Wang, Meng Zhou, Jiayuan Ding, and Pengtao Xie. 2020. CERT: Contrastive self-supervised learning for language understanding. arXiv preprint arXiv:2005.12766 (2020)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI.2019.8759236"},{"key":"e_1_3_2_1_15_1","volume-title":"SimCSE: Simple contrastive learning of sentence embeddings. arXiv preprint arXiv:2104.08821","author":"Gao Tianyu","year":"2021","unstructured":"Tianyu Gao, Xingcheng Yao, and Danqi Chen. 2021. SimCSE: Simple contrastive learning of sentence embeddings. arXiv preprint arXiv:2104.08821 (2021)."},{"key":"e_1_3_2_1_16_1","first-page":"21271","article-title":"Bootstrap your own latent-a new approach to self-supervised learning","volume":"33","author":"Grill Jean-Bastien","year":"2020","unstructured":"Jean-Bastien Grill, Florian Strub, Florent Altch\u00e9, Corentin Tallec, Pierre Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Guo, Mohammad Gheshlaghi Azar, et al. 2020. Bootstrap your own latent-a new approach to self-supervised learning. Advances in Neural Information Processing Systems 33 (2020), 21271--21284.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_19_1","volume-title":"Unsupervised multimodal representation learning across medical images and reports. arXiv preprint arXiv:1811.08615","author":"Harry Hsu Tzu-Ming","year":"2018","unstructured":"Tzu-Ming Harry Hsu, Wei-Hung Weng, Willie Boag, Matthew McDermott, and Peter Szolovits. 2018. Unsupervised multimodal representation learning across medical images and reports. arXiv preprint arXiv:1811.08615 (2018)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00391"},{"key":"e_1_3_2_1_21_1","unstructured":"Stephanie L Hyland Shruthi Bannur Kenza Bouzid Daniel C Castro Mercy Ranjit Anton Schwaighofer Fernando P\u00e9rez-Garc\u00eda Valentina Salvatelli Shaury Srivastav Anja Thieme et al. 2023. Maira-1: A specialised large multimodal model for radiology report generation. arXiv preprint arXiv:2311.13668 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301590"},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and visionlanguage representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-019-0322-0"},{"key":"e_1_3_2_1_25_1","volume-title":"Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems 36","author":"Li Chunyuan","year":"2024","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2024. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_26_1","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in Neural Information Processing Systems 34 (2021), 9694--9705.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Learning Representations.","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Pan Zhou, Caiming Xiong, and Steven Hoi. 2021. Prototypical contrastive learning of unsupervised representations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2023.3294980"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87196-3_20"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9434010"},{"key":"e_1_3_2_1_31_1","first-page":"23102","article-title":"COCO-LM: Correcting and contrasting text sequences for language model pretraining","volume":"34","author":"Meng Yu","year":"2021","unstructured":"Yu Meng, Chenyan Xiong, Payal Bajaj, Paul Bennett, Jiawei Han, Xia Song, et al. 2021. COCO-LM: Correcting and contrasting text sequences for language model pretraining. Advances in Neural Information Processing Systems 34 (2021), 23102--23114.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_32_1","volume-title":"Joint learning of localized representations from medical images and reports. arXiv preprint arXiv:2112.02889","author":"M\u00fcller Philip","year":"2021","unstructured":"Philip M\u00fcller, Georgios Kaissis, Congyu Zou, and Daniel R\u00fcckert. 2021. Joint learning of localized representations from medical images and reports. arXiv preprint arXiv:2112.02889 (2021)."},{"key":"e_1_3_2_1_33_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"George Shih Carol C Wu Safwan S Halabi Marc D Kohli Luciano M Prevedello Tessa S Cook Arjun Sharma Judith K Amorosa Veronica Arteaga Maya Galperin-Aizenberg et al. 2019. Augmenting the national institutes of health chest radiograph dataset with expert annotations of possible pneumonia. Radiology. Artificial Intelligence (2019).","DOI":"10.1148\/ryai.2019180041"},{"key":"e_1_3_2_1_36_1","unstructured":"Society for Imaging Informatics in Medicine (SIIM) 2019. SIIM-ACR Pneumothorax Segmentation. https:\/\/www.kaggle.com\/c\/siim-acr-pneumothoraxsegmentation."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01836"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-6504"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-00919-9_29"},{"key":"e_1_3_2_1_40_1","volume-title":"Representation learning with contrastive predictive coding. arXiv e-prints","author":"den Oord Aaron Van","year":"2018","unstructured":"Aaron Van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv e-prints (2018), arXiv--1807."},{"key":"e_1_3_2_1_41_1","first-page":"33536","article-title":"Multi-granularity cross-modal alignment for generalized medical visual representation learning","volume":"35","author":"Wang Fuying","year":"2022","unstructured":"Fuying Wang, Yuyin Zhou, Shujun Wang, Varut Vardhanabhuti, and Lequan Yu. 2022. Multi-granularity cross-modal alignment for generalized medical visual representation learning. Advances in Neural Information Processing Systems 35 (2022), 33536--33549.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_42_1","volume-title":"SNCSE: Contrastive Learning for Unsupervised Sentence Embedding with Soft Negative Samples. arXiv preprint arXiv:2201.05979","author":"Wang Hao","year":"2022","unstructured":"Hao Wang, Yangguang Li, Zhen Huang, Yong Dou, Lingpeng Kong, and Jing Shao. 2022. SNCSE: Contrastive Learning for Unsupervised Sentence Embedding with Soft Negative Samples. arXiv preprint arXiv:2201.05979 (2022)."},{"key":"e_1_3_2_1_43_1","volume-title":"International Conference on Machine Learning. PMLR, 9929--9939","author":"Phillip Isola TongzhouWang","year":"2020","unstructured":"TongzhouWang and Phillip Isola. 2020. Understanding contrastive representation learning through alignment and uniformity on the hypersphere. In International Conference on Machine Learning. PMLR, 9929--9939."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00943"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1670"},{"key":"e_1_3_2_1_46_1","volume-title":"CLEAR: Contrastive learning for sentence representation. arXiv preprint arXiv:2012.15466","author":"Wu Zhuofeng","year":"2020","unstructured":"Zhuofeng Wu, Sinong Wang, Jiatao Gu, Madian Khabsa, Fei Sun, and Hao Ma. 2020. CLEAR: Contrastive learning for sentence representation. arXiv preprint arXiv:2012.15466 (2020)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"e_1_3_2_1_48_1","volume-title":"ViLaM: A Vision-Language Model with Enhanced Visual Grounding and Generalization Capability. arXiv preprint arXiv:2311.12327","author":"Yang Xiaoyu","year":"2023","unstructured":"Xiaoyu Yang, Lijian Xu, Hongsheng Li, and Shaoting Zhang. 2023. ViLaM: A Vision-Language Model with Enhanced Visual Grounding and Generalization Capability. arXiv preprint arXiv:2311.12327 (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00692"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413761"},{"key":"e_1_3_2_1_51_1","volume-title":"Contrastive learning of medical visual representations from paired images and text. arXiv preprint arXiv:2010.00747","author":"Zhang Yuhao","year":"2020","unstructured":"Yuhao Zhang, Hang Jiang, Yasuhide Miura, Christopher D Manning, and Curtis P Langlotz. 2020. Contrastive learning of medical visual representations from paired images and text. arXiv preprint arXiv:2010.00747 (2020)."},{"key":"e_1_3_2_1_52_1","volume-title":"MedRG: Medical Report Grounding with Multi-modal Large Language Model. arXiv preprint arXiv:2404.06798","author":"Zou Ke","year":"2024","unstructured":"Ke Zou, Yang Bai, Zhihao Chen, Yang Zhou, Yidi Chen, Kai Ren, MengWang, Xuedong Yuan, Xiaojing Shen, and Huazhu Fu. 2024. MedRG: Medical Report Grounding with Multi-modal Large Language Model. arXiv preprint arXiv:2404.06798 (2024)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681531","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681531","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:48Z","timestamp":1750294668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681531"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":52,"alternative-id":["10.1145\/3664647.3681531","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681531","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}