{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T03:22:07Z","timestamp":1777951327279,"version":"3.51.4"},"reference-count":60,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100016834","name":"Chongqing Municipal Health Commission","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100016834","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1016\/j.eswa.2026.132577","type":"journal-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T15:13:25Z","timestamp":1777043605000},"page":"132577","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["CM-CGNS: Cross-modal clustering-guided negative sampling for self-supervised joint learning from medical images and reports"],"prefix":"10.1016","volume":"325","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4754-813X","authenticated-orcid":false,"given":"Libin","family":"Lan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7958-3976","authenticated-orcid":false,"given":"Hongxing","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6706-5817","authenticated-orcid":false,"given":"Zunhui","family":"Xia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0243-3949","authenticated-orcid":false,"given":"Juan","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8239-7176","authenticated-orcid":false,"given":"Xiaofei","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2829-6416","authenticated-orcid":false,"given":"Yongmei","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4870-1493","authenticated-orcid":false,"given":"Yudong","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1348-5305","authenticated-orcid":false,"given":"Xin","family":"Luo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.eswa.2026.132577_bib0001","series-title":"Proc. Workshop. Clini. Nat. Lang. Process.","first-page":"72","article-title":"Publicly available clinical BERT Embeddings","author":"Alsentzer","year":"2019"},{"key":"10.1016\/j.eswa.2026.132577_bib0002","series-title":"Proc. Int. Conf. Learn. Represent.","first-page":"1","article-title":"BEiT: BERT Pre-training of image transformers","author":"Bao","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0003","series-title":"Proc. Int. Conf. Learn. Represent.","first-page":"1","article-title":"VICReg: Variance-invariance-covariance regularization for self-supervised learning","author":"Bardes","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0004","series-title":"Proc. Int. Conf. Machin. Learn.","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020"},{"key":"10.1016\/j.eswa.2026.132577_bib0005","series-title":"Proc. Int. Conf. Learn. Represent.","first-page":"1","article-title":"Incremental false negative detection for contrastive learning","author":"Chen","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0006","series-title":"Proc. Annu. Meet. Assoc. Comput Linguist.","first-page":"9494","article-title":"Fine-grained image-text alignment in medical imaging enables explainable cyclic image-report generation","author":"Chen","year":"2024"},{"key":"10.1016\/j.eswa.2026.132577_bib0007","unstructured":"Chen, X., Fan, H., Girshick, R., & He, K. (2020b). Improved baselines with momentum contrastive learning. https:\/\/arxiv.org\/pdf\/2003.04297."},{"key":"10.1016\/j.eswa.2026.132577_bib0008","series-title":"Proc. IEEE Int. Conf. Comput. Vision Pattern Recognit.","first-page":"15750","article-title":"Exploring simple siamese representation learning","author":"Chen","year":"2021"},{"key":"10.1016\/j.eswa.2026.132577_bib0009","series-title":"Proc. ACM Int. Conf. Multimed.","first-page":"5152","article-title":"Align, reason and learn: Enhancing medical vision-and-language pre-training with knowledge","author":"Chen","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0010","series-title":"Proc. IEEE Int. Conf. Comput. Vision.","first-page":"21361","article-title":"PRIOR: Prototype Representation Joint Learning From Medical Images and Reports","author":"Cheng","year":"2023"},{"key":"10.1016\/j.eswa.2026.132577_bib0011","series-title":"Proc. Adv. Neural Inf. Proces. Syst.","first-page":"8765","article-title":"Debiased contrastive learning","volume":"33","author":"Chuang","year":"2020"},{"key":"10.1016\/j.eswa.2026.132577_bib0012","series-title":"Proc. IEEE Int. Conf. Comput. Vision Pattern Recognit.","first-page":"248","article-title":"ImageNet: A large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.eswa.2026.132577_bib0013","series-title":"Proc. Annu. Meet. Assoc. Comput Linguist.","first-page":"8713","article-title":"Clustering-aware negative sampling for unsupervised sentence representation","author":"Deng","year":"2023"},{"key":"10.1016\/j.eswa.2026.132577_bib0014","series-title":"Proc. Int. Conf. Learn. Represent.","first-page":"1","article-title":"An image is worth 16\u202f\u202f\u00d7\u202f\u202f16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"10.1016\/j.eswa.2026.132577_bib0015","doi-asserted-by":"crossref","first-page":"123","DOI":"10.1016\/j.comcom.2023.11.001","article-title":"MedGCN: An IoT-edge thrombus graph convolutional network for accurate prediction and prescription diagnosis of vascular occlusive diseases from unstructured clinical reports","volume":"214","author":"Gao","year":"2024","journal-title":"Computer Communications"},{"issue":"12","key":"10.1016\/j.eswa.2026.132577_bib0016","doi-asserted-by":"crossref","first-page":"9052","DOI":"10.1109\/TPAMI.2024.3415112","article-title":"A survey on self-supervised learning: Algorithms, applications, and future trends","volume":"46","author":"Gui","year":"2024","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2026.132577_bib0017","article-title":"Dynamic feature fusion guiding and multimodal large language model refining for medical image report generation","author":"Han","year":"2025","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132577_bib0018","series-title":"Proc. IEEE Int. Conf. Comput. Vision Pattern Recognit.","first-page":"16000","article-title":"Masked autoencoders are scalable vision learners","author":"He","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0019","series-title":"Proc. IEEE Int. Conf. Comput. Vision Pattern Recognit.","first-page":"9729","article-title":"Momentum contrast for unsupervised visual representation learning","author":"He","year":"2020"},{"key":"10.1016\/j.eswa.2026.132577_bib0020","series-title":"Proc. IEEE Int. Conf. Comput. Vision Pattern Recognit.","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.eswa.2026.132577_bib0021","unstructured":"Healthcare, J. (2020). Object-CXR-automatic detection of foreign objects on chest X-rays. https:\/\/github.com\/hlk-1135\/object-CXR?tab=readme-ov-file."},{"key":"10.1016\/j.eswa.2026.132577_bib0022","series-title":"Proc. IEEE Int. Conf. Comput. Vision.","first-page":"3942","article-title":"GLoRIA: A multimodal global-local representation learning framework for label-efficient medical image recognition","author":"Huang","year":"2021"},{"key":"10.1016\/j.eswa.2026.132577_bib0023","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"590","article-title":"CheXpert: A large chest radiograph dataset with uncertainty labels and expert comparison","author":"Irvin","year":"2019"},{"issue":"1","key":"10.1016\/j.eswa.2026.132577_bib0024","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1038\/sdata.2016.35","article-title":"MIMIC-III, a freely accessible critical care database","volume":"3","author":"Johnson","year":"2016","journal-title":"Scientific Data"},{"key":"10.1016\/j.eswa.2026.132577_bib0025","unstructured":"Johnson, A. E. W. et al. (2019). MIMIC-CXR-JPG, a large publicly available database of labeled chest radiographs. https:\/\/arxiv.org\/abs\/1901.07042v5."},{"key":"10.1016\/j.eswa.2026.132577_bib0026","series-title":"Proc. Adv. Neural Inf. Proces. Syst.","first-page":"21798","article-title":"Hard negative mixing for contrastive learning","author":"Kalantidis","year":"2020"},{"key":"10.1016\/j.eswa.2026.132577_bib0027","article-title":"Prompting across perception and recognition: A unified CLIP-based visual-text prompt framework for zero-shot anomaly detection","author":"Lai","year":"2025","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132577_bib0028","series-title":"Proc. Adv. Neural Inf. Proces. Syst.","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","author":"Li","year":"2021"},{"key":"10.1016\/j.eswa.2026.132577_bib0029","series-title":"Proc. Int. Conf. Inf. Knowledge. Manage","first-page":"1308","article-title":"Harnessing the power of pre-trained vision-language models for efficient medical report generation","author":"Li","year":"2023"},{"key":"10.1016\/j.eswa.2026.132577_bib0030","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"8547","article-title":"Contrastive clustering","author":"Li","year":"2021"},{"issue":"9","key":"10.1016\/j.eswa.2026.132577_bib0031","doi-asserted-by":"crossref","first-page":"2205","DOI":"10.1007\/s11263-022-01639-z","article-title":"Twin contrastive learning for online clustering","volume":"130","author":"Li","year":"2022","journal-title":"International Journal of Computer Vision"},{"issue":"5","key":"10.1016\/j.eswa.2026.132577_bib0032","doi-asserted-by":"crossref","first-page":"3587","DOI":"10.1109\/JBHI.2025.3528196","article-title":"MedFILIP: Medical fine-grained language-image pre-training","volume":"29","author":"Liang","year":"2025","journal-title":"IEEE Journal of Biomedical and Health Informatics"},{"key":"10.1016\/j.eswa.2026.132577_bib0033","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"7603","article-title":"Deep graph clustering via dual correlation reduction","author":"Liu","year":"2022"},{"issue":"10","key":"10.1016\/j.eswa.2026.132577_bib0034","doi-asserted-by":"crossref","first-page":"13789","DOI":"10.1109\/TNNLS.2023.3271871","article-title":"Simple contrastive graph clustering","volume":"35","author":"Liu","year":"2024","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"10.1016\/j.eswa.2026.132577_bib0035","series-title":"Proc. IEEE Int. Conf. Comput. Vision.","first-page":"10012","article-title":"Swin transformer: Hierarchical vision transformer using shifted windows","author":"Liu","year":"2021"},{"key":"10.1016\/j.eswa.2026.132577_bib0036","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"685","article-title":"Joint learning of localized representations from medical images and reports","author":"M\u00fcller","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0037","series-title":"Proc. Adv. Neural Inf. Proces. Syst.","first-page":"2148","article-title":"Multi-view contrastive graph clustering","author":"Pan","year":"2021"},{"key":"10.1016\/j.eswa.2026.132577_bib0038","series-title":"Proc. IEEE Int. Conf. Comput. Vision Pattern Recognit.","first-page":"6967","article-title":"Filtering, distillation, and hard negatives for vision-language pre-training","author":"Radenovic","year":"2023"},{"key":"10.1016\/j.eswa.2026.132577_bib0039","series-title":"Proc. Int. Conf. Machin. Learn.","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.eswa.2026.132577_bib0040","series-title":"Proc. Int. Conf. Learn. Represent.","article-title":"Contrastive learning with hard negative samples","author":"Robinson","year":"2021"},{"issue":"1","key":"10.1016\/j.eswa.2026.132577_bib0041","article-title":"Augmenting the national institutes of health chest radiograph dataset with expert annotations of possible pneumonia","volume":"1","author":"Shih","year":"2019","journal-title":"Radiology: Artificial Intelligence"},{"key":"10.1016\/j.eswa.2026.132577_bib0042","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.121526","article-title":"Miter: Medical image\u2013text joint adaptive pretraining with multi-level contrastive learning","volume":"238","author":"Shu","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132577_bib0043","series-title":"Proc. IEEE Int. Conf. Comput. Vision Pattern Recognit.","first-page":"20730","article-title":"Self-supervised pre-training of swin transformers for 3D medical image analysis","author":"Tang","year":"2022"},{"issue":"86","key":"10.1016\/j.eswa.2026.132577_bib0044","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"van der Maaten","year":"2008","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.eswa.2026.132577_bib0045","unstructured":"van den, O. A., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. https:\/\/arxiv.org\/pdf\/1807.03748."},{"key":"10.1016\/j.eswa.2026.132577_bib0046","series-title":"Proc. Adv. Neural Inf. Proces. Syst.","first-page":"33536","article-title":"Multi-granularity cross-modal alignment for generalized medical visual representation learning","author":"Wang","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0047","series-title":"Proc. Int. Conf. Intell. Comput.","first-page":"419","article-title":"SNCSE: Contrastive learning for unsupervised sentence embedding with soft negative samples","author":"Wang","year":"2023"},{"issue":"1","key":"10.1016\/j.eswa.2026.132577_bib0048","article-title":"COVID-Net: A tailored deep convolutional neural network design for detection of COVID-19 cases from chest x-ray images","volume":"10","author":"Wang","year":"2020","journal-title":"Scientific Reports"},{"key":"10.1016\/j.eswa.2026.132577_bib0049","series-title":"Proc. Conf. Empir. Methods Nat. Lang. Process.","first-page":"3876","article-title":"MedCLIP: Contrastive learning from unpaired medical images and text","author":"Wang","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0050","series-title":"Proc. IEEE Int. Conf. Comput. Vision Pattern Recognit.","first-page":"9653","article-title":"SimMIM: A simple framework for masked image modeling","author":"Xie","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0051","unstructured":"Zawacki, A. et al. (2019). SIIM-ACR pneumothorax segmentation. Kagglehttps:\/\/kaggle.com\/competitions\/siim-acr-pneumothorax-segmentation."},{"key":"10.1016\/j.eswa.2026.132577_bib0052","series-title":"Proc. Int. Conf. Machin. Learn.","first-page":"12310","article-title":"Barlow twins: Self-supervised learning via redundancy reduction","author":"Zbontar","year":"2021"},{"issue":"1","key":"10.1016\/j.eswa.2026.132577_bib0053","doi-asserted-by":"crossref","first-page":"72","DOI":"10.1038\/s44172-023-00121-z","article-title":"A generalized dual-domain generative framework with hierarchical consistency for medical image reconstruction and synthesis","volume":"2","author":"Zhang","year":"2023","journal-title":"Communications Engineering"},{"key":"10.1016\/j.eswa.2026.132577_bib0054","series-title":"Proc. Mach. Learn. Res.","first-page":"2","article-title":"Contrastive learning of medical visual representations from paired images and text","author":"Zhang","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0055","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"11730","article-title":"Unsupervised sentence representation via contrastive learning with mixing negatives","author":"Zhang","year":"2022"},{"key":"10.1016\/j.eswa.2026.132577_bib0056","series-title":"Proc. IEEE Int. Conf. Comput. Vision Pattern Recognit.","first-page":"29744","article-title":"MedUnifier: Unifying vision-and-language pre-training on medical data with vision generation task using discrete visual representations","author":"Zhang","year":"2025"},{"key":"10.1016\/j.eswa.2026.132577_bib0057","series-title":"Proc. IEEE Int. Conf. Comput. Vision.","first-page":"9224","article-title":"Graph contrastive clustering","author":"Zhong","year":"2021"},{"key":"10.1016\/j.eswa.2026.132577_bib0058","series-title":"Proc. IEEE Int. Symp. Biomed. Imaging","first-page":"1","article-title":"Self pre-training with masked autoencoders for medical image classification and segmentation","author":"Zhou","year":"2023"},{"key":"10.1016\/j.eswa.2026.132577_bib0059","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"13041","article-title":"Unified vision-language pre-training for image captioning and VQA","author":"Zhou","year":"2020"},{"key":"10.1016\/j.eswa.2026.132577_bib0060","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.126660","article-title":"MCG-Net: Medical chief complaint-guided multi-modal masked content pre-training for chest image classification","volume":"271","author":"Zou","year":"2025","journal-title":"Expert Systems with Applications"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426014909?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426014909?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T09:38:08Z","timestamp":1777887488000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417426014909"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,9]]},"references-count":60,"alternative-id":["S0957417426014909"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132577","relation":{},"ISSN":["0957-4174"],"issn-type":[{"value":"0957-4174","type":"print"}],"subject":[],"published":{"date-parts":[[2026,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"CM-CGNS: Cross-modal clustering-guided negative sampling for self-supervised joint learning from medical images and reports","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132577","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"132577"}}