{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T06:59:12Z","timestamp":1758351552550,"version":"3.44.0"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032051400"},{"type":"electronic","value":"9783032051417"}],"license":[{"start":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T00:00:00Z","timestamp":1758326400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T00:00:00Z","timestamp":1758326400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-05141-7_11","type":"book-chapter","created":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T08:18:52Z","timestamp":1758269932000},"page":"106-116","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Endo-CLIP: Progressive Self-supervised Pre-training on\u00a0Raw Colonoscopy Records"],"prefix":"10.1007","author":[{"given":"Yili","family":"He","sequence":"first","affiliation":[]},{"given":"Yan","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Peiyao","family":"Fu","sequence":"additional","affiliation":[]},{"given":"Ruijie","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Tianyi","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Zhihua","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Quanlin","family":"Li","sequence":"additional","affiliation":[]},{"given":"Pinghong","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Xian","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Shuo","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,20]]},"reference":[{"key":"11_CR1","doi-asserted-by":"crossref","unstructured":"Ahmad, O.F., et\u00a0al.: Establishing key research questions for the implementation of artificial intelligence in colonoscopy: a modified delphi method. Endoscopy 53(09), 893\u2013901 (2021)","DOI":"10.1055\/a-1384-0485"},{"key":"11_CR2","doi-asserted-by":"crossref","unstructured":"Ali, S.: Where do we stand in ai for endoscopic image analysis? deciphering gaps and future directions. npj Digital Med. 5(1), 184 (2022)","DOI":"10.1038\/s41746-022-00733-3"},{"issue":"4","key":"11_CR3","doi-asserted-by":"publisher","first-page":"683","DOI":"10.1136\/gutjnl-2015-310912","volume":"66","author":"M Arnold","year":"2017","unstructured":"Arnold, M., Sierra, M.S., Laversanne, M., Soerjomataram, I., Jemal, A., Bray, F.: Global patterns and trends in colorectal cancer incidence and mortality. Gut 66(4), 683\u2013691 (2017)","journal-title":"Gut"},{"key":"11_CR4","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)"},{"issue":"6","key":"11_CR5","doi-asserted-by":"publisher","first-page":"1085","DOI":"10.1007\/s11548-024-03091-5","volume":"19","author":"D Bati\u0107","year":"2024","unstructured":"Bati\u0107, D., Holm, F., \u00d6zsoy, E., Czempiel, T., Navab, N.: Endovit: pretraining vision transformers on a large collection of endoscopic images. Int. J. Comput. Assist. Radiol. Surg. 19(6), 1085\u20131091 (2024)","journal-title":"Int. J. Comput. Assist. Radiol. Surg."},{"key":"11_CR6","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF international Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"11_CR7","doi-asserted-by":"publisher","unstructured":"Chen, X., He, Y., Xue, C., Ge, R., Li, S., Yang, G.: Knowledge boosting: rethinking medical contrastive vision-language pre-training. In: International Conference on Medical Image Computing and Computer-Assisted Intervention. pp. 405\u2013415. Springer (2023). https:\/\/doi.org\/10.1007\/978-3-031-43907-0_39","DOI":"10.1007\/978-3-031-43907-0_39"},{"issue":"5","key":"11_CR8","doi-asserted-by":"publisher","first-page":"1481","DOI":"10.1038\/s41591-024-02959-y","volume":"30","author":"M Christensen","year":"2024","unstructured":"Christensen, M., Vukadinovic, M., Yuan, N., Ouyang, D.: Vision-language foundation model for echocardiogram interpretation. Nat. Med. 30(5), 1481\u20131488 (2024)","journal-title":"Nat. Med."},{"key":"11_CR9","doi-asserted-by":"publisher","DOI":"10.1016\/j.compbiomed.2023.107037","volume":"161","author":"Y Dai","year":"2023","unstructured":"Dai, Y., Liu, F., Chen, W., Liu, Y., Shi, L., Liu, S., Zhou, Y., et al.: Swin mae: masked autoencoders for small datasets. Comput. Biol. Med. 161, 107037 (2023)","journal-title":"Comput. Biol. Med."},{"key":"11_CR10","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"11_CR11","unstructured":"Fang, A., Jose, A.M., Jain, A., Schmidt, L., Toshev, A., Shankar, V.: Data filtering networks. arXiv preprint arXiv:2309.17425 (2023)"},{"key":"11_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"9","key":"11_CR14","doi-asserted-by":"publisher","first-page":"2307","DOI":"10.1038\/s41591-023-02504-3","volume":"29","author":"Z Huang","year":"2023","unstructured":"Huang, Z., Bianchi, F., Yuksekgonul, M., Montine, T.J., Zou, J.: A visual-language foundation model for pathology image analysis using medical twitter. Nat. Med. 29(9), 2307\u20132316 (2023)","journal-title":"Nat. Med."},{"issue":"1","key":"11_CR15","doi-asserted-by":"publisher","first-page":"76","DOI":"10.1053\/j.gastro.2019.08.058","volume":"158","author":"C Berre","year":"2020","unstructured":"Berre, C., et al.: Application of artificial intelligence to gastroenterology and hepatology. Gastroenterology 158(1), 76\u201394 (2020)","journal-title":"Gastroenterology"},{"key":"11_CR16","doi-asserted-by":"publisher","unstructured":"Lin, W., Zhao, Z., Zhang, X., Wu, C., Zhang, Y., Wang, Y., Xie, W.: Pmc-clip: contrastive language-image pre-training using biomedical documents. In: International Conference on Medical Image Computing and Computer-Assisted Intervention. pp. 525\u2013536. Springer (2023). https:\/\/doi.org\/10.1007\/978-3-031-43993-3_51","DOI":"10.1007\/978-3-031-43993-3_51"},{"key":"11_CR17","unstructured":"Van\u00a0der Maaten, L., Hinton, G.: Visualizing data using t-sne. J. Mach. Learn. Res. 9(11) (2008)"},{"key":"11_CR18","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"11_CR19","unstructured":"Oquab, M., et\u00a0al.: Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"11_CR20","doi-asserted-by":"crossref","unstructured":"Pizzi, E., Roy, S.D., Ravindra, S.N., Goyal, P., Douze, M.: A self-supervised descriptor for image copy detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14532\u201314542 (2022)","DOI":"10.1109\/CVPR52688.2022.01413"},{"key":"11_CR21","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PmLR (2021)"},{"issue":"12","key":"11_CR22","doi-asserted-by":"publisher","first-page":"1399","DOI":"10.1038\/s41551-022-00936-9","volume":"6","author":"E Tiu","year":"2022","unstructured":"Tiu, E., Talius, E., Patel, P., Langlotz, C.P., Ng, A.Y., Rajpurkar, P.: Expert-level detection of pathologies from unannotated chest x-ray images via self-supervised learning. Nature Biomed. Eng. 6(12), 1399\u20131406 (2022)","journal-title":"Nature Biomed. Eng."},{"key":"11_CR23","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inform. Process. Syst. 30 (2017)"},{"key":"11_CR24","first-page":"33536","volume":"35","author":"F Wang","year":"2022","unstructured":"Wang, F., Zhou, Y., Wang, S., Vardhanabhuti, V., Yu, L.: Multi-granularity cross-modal alignment for generalized medical visual representation learning. Adv. Neural. Inf. Process. Syst. 35, 33536\u201333549 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR25","unstructured":"Wang, S., et\u00a0al.: Knowledge extraction and distillation from large-scale image-text colonoscopy records leveraging large language and vision models. arXiv preprint arXiv:2310.11173 (2023)"},{"key":"11_CR26","doi-asserted-by":"publisher","unstructured":"Wang, Z., Liu, C., Zhang, S., Dou, Q.: Foundation model for endoscopy video analysis via large-scale self-supervised pre-train. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 101\u2013111. Springer (2023). https:\/\/doi.org\/10.1007\/978-3-031-43996-4_10","DOI":"10.1007\/978-3-031-43996-4_10"},{"key":"11_CR27","doi-asserted-by":"crossref","unstructured":"Wang, Z., Wu, Z., Agarwal, D., Sun, J.: Medclip: contrastive learning from unpaired medical images and text. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing. Conference on Empirical Methods in Natural Language Processing, vol.\u00a02022, p.\u00a03876 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.256"},{"issue":"8015","key":"11_CR28","doi-asserted-by":"publisher","first-page":"181","DOI":"10.1038\/s41586-024-07441-w","volume":"630","author":"H Xu","year":"2024","unstructured":"Xu, H., et al.: A whole-slide foundation model for digital pathology from real-world data. Nature 630(8015), 181\u2013188 (2024)","journal-title":"Nature"},{"key":"11_CR29","unstructured":"Zhang, S., et\u00a0al.: Biomedclip: a multimodal biomedical foundation model pretrained from fifteen million scientific image-text pairs. arXiv preprint arXiv:2303.00915 (2023)"},{"key":"11_CR30","unstructured":"Zhang, Y., Jiang, H., Miura, Y., Manning, C.D., Langlotz, C.P.: Contrastive learning of medical visual representations from paired images and text. In: Machine learning for Healthcare Conference. pp. 2\u201325. PMLR (2022)"},{"issue":"7981","key":"11_CR31","doi-asserted-by":"publisher","first-page":"156","DOI":"10.1038\/s41586-023-06555-x","volume":"622","author":"Y Zhou","year":"2023","unstructured":"Zhou, Y., et al.: A foundation model for generalizable disease detection from retinal images. Nature 622(7981), 156\u2013163 (2023)","journal-title":"Nature"}],"container-title":["Lecture Notes in Computer Science","Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-05141-7_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T08:19:02Z","timestamp":1758269942000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-05141-7_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,20]]},"ISBN":["9783032051400","9783032051417"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-05141-7_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,9,20]]},"assertion":[{"value":"20 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MICCAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Medical Image Computing and Computer-Assisted Intervention","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Daejeon","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Korea (Republic of)","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"miccai2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conferences.miccai.org\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}