{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,17]],"date-time":"2025-05-17T04:01:51Z","timestamp":1747454511331,"version":"3.40.5"},"publisher-location":"Cham","reference-count":94,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031889769","type":"print"},{"value":"9783031889776","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88977-6_8","type":"book-chapter","created":{"date-parts":[[2025,5,16]],"date-time":"2025-05-16T05:22:40Z","timestamp":1747372960000},"page":"72-88","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Video Foundation Model for\u00a0Medical 3D Segmentation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-4028-6917","authenticated-orcid":false,"given":"Qi","family":"Ma","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8667-9656","authenticated-orcid":false,"given":"Guolei","family":"Sun","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7852-0696","authenticated-orcid":false,"given":"G\u00fcney I\u015f\u0131k","family":"Tombak","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1833-3416","authenticated-orcid":false,"given":"Shipra","family":"Jain","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2089-0150","authenticated-orcid":false,"given":"Niko Benjamin","family":"Huber","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3445-5711","authenticated-orcid":false,"given":"Luc Van","family":"Gool","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2542-3611","authenticated-orcid":false,"given":"Ender","family":"Konukoglu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,17]]},"reference":[{"issue":"1","key":"8_CR1","doi-asserted-by":"publisher","first-page":"4128","DOI":"10.1038\/s41467-022-30695-9","volume":"13","author":"M Antonelli","year":"2022","unstructured":"Antonelli, M., et al.: The medical segmentation decathlon. Nat. Commun. 13(1), 4128 (2022). https:\/\/doi.org\/10.1038\/s41467-022-30695-9","journal-title":"Nat. Commun."},{"key":"8_CR2","unstructured":"Azad, B., et al.: Foundational models in medical imaging: a comprehensive survey and future vision (2023). https:\/\/arxiv.org\/abs\/2310.18689"},{"key":"8_CR3","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: Bert pre-training of image transformers. arXiv preprint (2021). arXiv:2106.08254"},{"key":"8_CR4","unstructured":"Ben-Hamadou, A., et al.: Teeth3ds: a benchmark for teeth segmentation and labeling from intra-oral 3d scans. arXiv preprint (2022). arXiv:2210.06094"},{"key":"8_CR5","unstructured":"Ben-Hamadou, A., et\u00a0al.: 3dteethseg\u201922: 3d teeth scan segmentation and labeling challenge. arXiv preprint (2023). arXiv:2305.18277"},{"key":"8_CR6","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML, vol. 2, p. 4 (2021)"},{"key":"8_CR7","unstructured":"Bolelli, F., et al.: Toothfairy2: multi-structure segmentation in CBCT volumes. In: Proceedings of the MICCAI Multi-Atlas Labeling Beyond Cranial Vault-Workshop Challenge, University of Modena and Reggio Emilia, Italy (2024)"},{"key":"8_CR8","unstructured":"Bommasani, R., et\u00a0al.: On the opportunities and risks of foundation models. arXiv preprint (2021). arXiv:2108.07258"},{"key":"8_CR9","unstructured":"Buess, L., et al.: Video-CT MAE: Self-supervised video-CT domain adaptation for vertebral fracture diagnosis. In: Medical Imaging with Deep Learning (2024)"},{"key":"8_CR10","unstructured":"Cao, S., Xu, P., Clifton, D.A.: How to understand masked autoencoders. arXiv preprint (2022). arXiv:2202.03670"},{"key":"8_CR11","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: European conference on computer vision, pp. 213\u2013229. Springer, Cham (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"8_CR12","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers (2021). https:\/\/arxiv.org\/abs\/2104.14294","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"8_CR13","unstructured":"Chaitanya, K., Erdil, E., Karani, N., Konukoglu, E.: Contrastive learning of global and local features for medical image segmentation with limited annotations (2020). https:\/\/arxiv.org\/abs\/2006.10511"},{"key":"8_CR14","doi-asserted-by":"publisher","first-page":"102792","DOI":"10.1016\/j.media.2023.102792","volume":"87","author":"K Chaitanya","year":"2023","unstructured":"Chaitanya, K., Erdil, E., Karani, N., Konukoglu, E.: Local contrastive loss with pseudo-label based self-training for semi-supervised medical image segmentation. Med. Image Anal. 87, 102792 (2023)","journal-title":"Med. Image Anal."},{"key":"8_CR15","doi-asserted-by":"crossref","unstructured":"Chartsias, A., Joyce, T., Dharmakumar, R., Tsaftaris, S.A.: Adversarial image synthesis for unpaired multi-modal cardiac data. In: Simulation and Synthesis in Medical Imaging: Second International Workshop, SASHIMI 2017, Held in Conjunction with MICCAI 2017, Qu\u00e9bec City, 10 September 2017, Proceedings 2, pp. 3\u201313. Springer (2017)","DOI":"10.1007\/978-3-319-68127-6_1"},{"key":"8_CR16","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"8_CR17","doi-asserted-by":"crossref","unstructured":"Chen, X., He, K.: Exploring simple siamese representation learning (2020). https:\/\/arxiv.org\/abs\/2011.10566","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"8_CR18","doi-asserted-by":"publisher","first-page":"11500","DOI":"10.1109\/ACCESS.2022.3144840","volume":"10","author":"M Cipriano","year":"2022","unstructured":"Cipriano, M., et al.: Deep segmentation of the mandibular canal: a new 3D annotated dataset of CBCT volumes. IEEE Access 10, 11500\u201311510 (2022)","journal-title":"IEEE Access"},{"key":"8_CR19","doi-asserted-by":"crossref","unstructured":"Cipriano, M., Allegretti, S., Bolelli, F., Pollastri, F., Grana, C.: Improving segmentation of the inferior alveolar nerve through deep label propagation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21137\u201321146 (2022)","DOI":"10.1109\/CVPR52688.2022.02046"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Cui, W., et\u00a0al.: Ctooth+: a large-scale dental cone beam computed tomography dataset and benchmark for tooth volume segmentation. In: MICCAI Workshop on Data Augmentation, Labelling, and Imperfections, pp. 64\u201373. Springer, Cham (2022)","DOI":"10.1007\/978-3-031-17027-0_7"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Cui, W., et al.: Ctooth: a fully annotated 3D dataset and benchmark for tooth volume segmentation on cone beam computed tomography images. In: International Conference on Intelligent Robotics and Applications, pp. 191\u2013200. Springer, Cham (2022)","DOI":"10.1007\/978-3-031-13841-6_18"},{"key":"8_CR22","unstructured":"D\u2019Antonoli, T.A., et al.: Totalsegmentator MRI: sequence-independent segmentation of 59 anatomical structures in MR images (2024). https:\/\/arxiv.org\/abs\/2405.19492"},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"8_CR24","doi-asserted-by":"publisher","first-page":"103207","DOI":"10.1016\/j.media.2024.103207","volume":"95","author":"A Diaz-Pinto","year":"2024","unstructured":"Diaz-Pinto, A., et al.: Monai label: a framework for AI-assisted interactive labeling of 3D medical images. Med. Image Anal. 95, 103207 (2024). https:\/\/doi.org\/10.1016\/j.media.2024.103207","journal-title":"Med. Image Anal."},{"key":"8_CR25","unstructured":"Dosovitskiy, A., et al.: An image is worth 16 x 16 words: transformers for image recognition at scale (2021). https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"8_CR26","doi-asserted-by":"crossref","unstructured":"Dou, Q., Ouyang, C., Chen, C., Chen, H., Heng, P.A.: Unsupervised cross-modality domain adaptation of convnets for biomedical image segmentations with adversarial loss. arXiv preprint (2018). arXiv:1804.10916","DOI":"10.24963\/ijcai.2018\/96"},{"key":"8_CR27","unstructured":"Du, Y., Bai, F., Huang, T., Zhao, B.: Segvol: universal and interactive volumetric medical image segmentation. arXiv preprint (2023). arXiv:2311.13385"},{"key":"8_CR28","unstructured":"Eslami, S., de\u00a0Melo, G., Meinel, C.: Does clip benefit visual question answering in the medical domain as much as it does in the general domain? (2021). https:\/\/arxiv.org\/abs\/2112.13906"},{"key":"8_CR29","unstructured":"Feichtenhofer, C., Li, Y., He, K., et al.: Masked autoencoders as spatiotemporal learners. In: Advance in Neural Information Processing System, vol. 35, pp. 35946\u201335958 (2022)"},{"key":"8_CR30","unstructured":"Gao, Y., Zhou, M., Liu, D., Yan, Z., Zhang, S., Metaxas, D.N.: A data-scalable transformer for medical image segmentation: architecture, model efficiency, and benchmark (2023). https:\/\/arxiv.org\/abs\/2203.00131"},{"key":"8_CR31","unstructured":"Gu, A., Dao, T.: Mamba: linear-time sequence modeling with selective state spaces. arXiv preprint (2023). arXiv:2312.00752"},{"issue":"3","key":"8_CR32","doi-asserted-by":"publisher","first-page":"1173","DOI":"10.1109\/tbme.2021.3117407","volume":"69","author":"H Guan","year":"2022","unstructured":"Guan, H., Liu, M.: Domain adaptation for medical image analysis: a survey. IEEE Trans. Biomed. Eng. 69(3), 1173\u20131185 (2022). https:\/\/doi.org\/10.1109\/tbme.2021.3117407","journal-title":"IEEE Trans. Biomed. Eng."},{"key":"8_CR33","doi-asserted-by":"crossref","unstructured":"Hatamizadeh, A., et al.: Unetr: transformers for 3D medical image segmentation (2021). https:\/\/arxiv.org\/abs\/2103.10504","DOI":"10.1109\/WACV51458.2022.00181"},{"key":"8_CR34","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"8_CR35","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning (2020). https:\/\/arxiv.org\/abs\/1911.05722","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"He, Y., Nath, V., Yang, D., Tang, Y., Myronenko, A., Xu, D.: Swinunetr-v2: stronger swin transformers with stagewise convolutions for 3D medical image segmentation. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 416\u2013426. Springer, Cham (2023)","DOI":"10.1007\/978-3-031-43901-8_40"},{"key":"8_CR37","unstructured":"Heller, N., et al.: The kits21 challenge: automatic segmentation of kidneys, renal tumors, and renal cysts in corticomedullary-phase CT (2023). https:\/\/arxiv.org\/abs\/2307.01984"},{"key":"8_CR38","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint (2021). arXiv:2106.09685"},{"key":"8_CR39","unstructured":"Huang, Z., et al.: Stu-net: scalable and transferable medical image segmentation models empowered by large-scale supervised pre-training (2023). https:\/\/arxiv.org\/abs\/2304.06716"},{"issue":"2","key":"8_CR40","doi-asserted-by":"publisher","first-page":"203","DOI":"10.1038\/s41592-020-01008-z","volume":"18","author":"F Isensee","year":"2021","unstructured":"Isensee, F., Jaeger, P.F., Kohl, S.A., Petersen, J., Maier-Hein, K.H.: nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation. Nat. Methods 18(2), 203\u2013211 (2021)","journal-title":"Nat. Methods"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Isensee, F., Maier-Hein, K.H.: An attempt at beating the 3D U-net (2019). https:\/\/arxiv.org\/abs\/1908.02182","DOI":"10.24926\/548719.001"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Isensee, F., et al.: nnU-Net revisited: a call for rigorous validation in 3D medical image segmentation (2024). https:\/\/arxiv.org\/abs\/2404.09556","DOI":"10.1007\/978-3-031-72114-4_47"},{"key":"8_CR43","unstructured":"Ji, Y., et\u00a0al.: Amos: a large-scale abdominal multi-organ benchmark for versatile medical image segmentation. arXiv preprint (2022). arXiv:2206.08023"},{"key":"8_CR44","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision (2021). https:\/\/arxiv.org\/abs\/2102.05918"},{"key":"8_CR45","unstructured":"Kay, W., et al.: The kinetics human action video dataset (2017). https:\/\/arxiv.org\/abs\/1705.06950"},{"key":"8_CR46","unstructured":"Landman, B., Xu, Z., Iglesias, J.E., Styner, M., et\u00a0al.: 2015 MICCAI multi-atlas labeling beyond the cranial vault workshop and challenge. In: Proceedings of the MICCAI Multi-Atlas Labeling Beyond Cranial Vault-Workshop Challenge (2015)"},{"key":"8_CR47","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation (2022). https:\/\/arxiv.org\/abs\/2201.03546"},{"key":"8_CR48","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: Videomamba: state space model for efficient video understanding (2024). https:\/\/arxiv.org\/abs\/2403.06977","DOI":"10.1007\/978-3-031-73347-5_14"},{"key":"8_CR49","doi-asserted-by":"publisher","first-page":"103285","DOI":"10.1016\/j.media.2024.103285","volume":"97","author":"W Li","year":"2024","unstructured":"Li, W., et al.: Abdomenatlas: a large-scale, detailed-annotated, & multi-center dataset for efficient transfer learning and open algorithmic benchmarking. Med. Image Anal. 97, 103285 (2024)","journal-title":"Med. Image Anal."},{"key":"8_CR50","doi-asserted-by":"crossref","unstructured":"Li, Y., Mao, H., Girshick, R., He, K.: Exploring plain vision transformer backbones for object detection (2022). https:\/\/arxiv.org\/abs\/2203.16527","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"8_CR51","unstructured":"Li, Y., Zhang, K., Cao, J., Timofte, R., Gool, L.V.: Localvit: bringing locality to vision transformers (2021). https:\/\/arxiv.org\/abs\/2104.05707"},{"key":"8_CR52","doi-asserted-by":"publisher","unstructured":"Liu, J., et al.: Clip-driven universal model for organ segmentation and tumor detection. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE (2023). https:\/\/doi.org\/10.1109\/iccv51070.2023.01934","DOI":"10.1109\/iccv51070.2023.01934"},{"key":"8_CR53","unstructured":"Liu, Y., Sun, G., Qiu, Y., Zhang, L., Chhatkuli, A., Van\u00a0Gool, L.: Transformer in convolutional neural networks. 3 arXiv preprint (2021). arxiv:2106.03180"},{"key":"8_CR54","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows (2021). https:\/\/arxiv.org\/abs\/2103.14030","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"8_CR55","unstructured":"Liu, Z., et al.: Video swin transformer (2021). https:\/\/arxiv.org\/abs\/2106.13230"},{"key":"8_CR56","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization (2019). https:\/\/arxiv.org\/abs\/1711.05101"},{"key":"8_CR57","doi-asserted-by":"crossref","unstructured":"Lumetti, L., Pipoli, V., Bolelli, F., Ficarra, E., Grana, C.: Enhancing patch-based learning for the segmentation of the mandibular canal. IEEE Access (2024)","DOI":"10.1109\/ACCESS.2024.3408629"},{"key":"8_CR58","unstructured":"Ma, J., Li, F., Wang, B.: U-mamba: enhancing long-range dependency for biomedical image segmentation. arXiv preprint (2024). arXiv:2401.04722"},{"key":"8_CR59","unstructured":"Ma, Q., et al.: Shapesplat: a large-scale dataset of gaussian splats and their self-supervised pretraining (2024). https:\/\/arxiv.org\/abs\/2408.10906"},{"key":"8_CR60","doi-asserted-by":"crossref","unstructured":"Ma, Q., Zhang, J., Qi, L., Yu, Q., Shi, Y., Gao, Y.: Constructing and exploring intermediate domains in mixed domain semi-supervised medical image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11642\u201311651 (2024)","DOI":"10.1109\/CVPR52733.2024.01106"},{"issue":"6","key":"8_CR61","doi-asserted-by":"publisher","first-page":"e314","DOI":"10.1016\/S2589-7500(20)30085-6","volume":"2","author":"M Monteiro","year":"2020","unstructured":"Monteiro, M., et al.: Multiclass semantic segmentation and quantification of traumatic brain injury lesions on head CT using deep learning: an algorithm development and multicentre validation study. Lancet Digit. Health 2(6), e314\u2013e322 (2020)","journal-title":"Lancet Digit. Health"},{"key":"8_CR62","unstructured":"Oquab, M., et al.: Dinov2: learning robust visual features without supervision (2024). https:\/\/arxiv.org\/abs\/2304.07193"},{"key":"8_CR63","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision (2021). https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"8_CR64","unstructured":"Ramachandran, P., Parmar, N., Vaswani, A., Bello, I., Levskaya, A., Shlens, J.: Stand-alone self-attention in vision models (2019). https:\/\/arxiv.org\/abs\/1906.05909"},{"key":"8_CR65","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Bochkovskiy, A., Koltun, V.: Vision transformers for dense prediction (2021). https:\/\/arxiv.org\/abs\/2103.13413","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"8_CR66","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation (2015). https:\/\/arxiv.org\/abs\/1505.04597","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"8_CR67","unstructured":"Roy, S., et al.: Mednext: transformer-driven scaling of convnets for medical image segmentation (2024). https:\/\/arxiv.org\/abs\/2303.09975"},{"key":"8_CR68","unstructured":"Ryali, C., et al.: Hiera: a hierarchical vision transformer without the bells-and-whistles (2023). https:\/\/arxiv.org\/abs\/2306.00989"},{"key":"8_CR69","unstructured":"Shen, Z., Zhang, M., Zhao, H., Yi, S., Li, H.: Efficient attention: attention with linear complexities. CoRR abs\/1812.01243 (2018). http:\/\/arxiv.org\/abs\/1812.01243"},{"key":"8_CR70","unstructured":"Simpson, A.L., et al.: A large annotated medical image dataset for the development and evaluation of segmentation algorithms (2019). https:\/\/arxiv.org\/abs\/1902.09063"},{"key":"8_CR71","doi-asserted-by":"crossref","unstructured":"Sun, G., Liu, Y., Ding, H., Wu, M., Van\u00a0Gool, L.: Learning local and global temporal contexts for video semantic segmentation. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3387326"},{"key":"8_CR72","doi-asserted-by":"crossref","unstructured":"Sun, G., Liu, Y., Probst, T., Paudel, D.P., Popovic, N., Van\u00a0Gool, L.: Rethinking global context in crowd counting. Mach. Intell. Res. 1\u201312 (2024)","DOI":"10.1007\/s11633-023-1475-z"},{"key":"8_CR73","unstructured":"Tao, C., et al.: Siamese image modeling for self-supervised vision representation learning (2022). https:\/\/arxiv.org\/abs\/2206.01204"},{"key":"8_CR74","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: Videomae: masked autoencoders are data-efficient learners for self-supervised video pre-training. In: Advance in Neural Information Processing System, vol. 35, pp. 10078\u201310093 (2022)"},{"key":"8_CR75","unstructured":"Vaswani, A.: Attention is all you need. In: Advance in Neural Information Processing System (2017)"},{"key":"8_CR76","unstructured":"Vorontsov, E., et\u00a0al.: Virchow: a million-slide digital pathology foundation model. arXiv preprint (2023). arXiv:2309.07778"},{"key":"8_CR77","unstructured":"Wang, H., et al.: Sam-med3d: towards general-purpose segmentation models for volumetric medical images (2024). https:\/\/arxiv.org\/abs\/2310.15161"},{"key":"8_CR78","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: Videomae v2: scaling video masked autoencoders with dual masking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14549\u201314560 (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"8_CR79","doi-asserted-by":"crossref","unstructured":"Wang, R., et al.: Bevt: bert pretraining of video transformers (2022). https:\/\/arxiv.org\/abs\/2112.01529","DOI":"10.1109\/CVPR52688.2022.01432"},{"key":"8_CR80","unstructured":"Wang, S., Li, B.Z., Khabsa, M., Fang, H., Ma, H.: Linformer: self-attention with linear complexity (2020)"},{"key":"8_CR81","unstructured":"Wang, Y., et\u00a0al.: STS MICCAI 2023 challenge: grand challenge on 2D and 3D semi-supervised tooth segmentation. arXiv preprint (2024). arXiv:2407.13246"},{"key":"8_CR82","doi-asserted-by":"crossref","unstructured":"Wang, Z., Wu, Z., Agarwal, D., Sun, J.: Medclip: contrastive learning from unpaired medical images and text (2022). https:\/\/arxiv.org\/abs\/2210.10163","DOI":"10.18653\/v1\/2022.emnlp-main.256"},{"issue":"1","key":"8_CR83","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1148\/radiol.2020192224","volume":"295","author":"MJ Willemink","year":"2020","unstructured":"Willemink, M.J., et al.: Preparing medical imaging data for machine learning. Radiology 295(1), 4\u201315 (2020)","journal-title":"Radiology"},{"key":"8_CR84","doi-asserted-by":"crossref","unstructured":"Wu, H., et al.: Cvt: introducing convolutions to vision transformers (2021). https:\/\/arxiv.org\/abs\/2103.15808","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"8_CR85","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1007\/978-3-030-87199-4_16","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2021","author":"Y Xie","year":"2021","unstructured":"Xie, Y., Zhang, J., Shen, C., Xia, Y.: CoTr: efficiently bridging CNN and transformer for 3D medical image segmentation. In: de Bruijne, M., et al. (eds.) MICCAI 2021. LNCS, vol. 12903, pp. 171\u2013180. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-87199-4_16"},{"key":"8_CR86","unstructured":"Xie, Z., et al.: Simmim: a simple framework for masked image modeling (2022). https:\/\/arxiv.org\/abs\/2111.09886"},{"key":"8_CR87","unstructured":"Xie, Z., et al.: On data scaling in masked image modeling (2022). https:\/\/arxiv.org\/abs\/2206.04664"},{"key":"8_CR88","unstructured":"Xu, H., Ding, S., Zhang, X., Xiong, H., Tian, Q.: Masked autoencoders are robust data augmentors. arXiv preprint (2022). arXiv:2206.04846"},{"key":"8_CR89","doi-asserted-by":"crossref","unstructured":"Yu, X., Tang, L., Rao, Y., Huang, T., Zhou, J., Lu, J.: Point-bert: pre-training 3D point cloud transformers with masked point modeling (2022). https:\/\/arxiv.org\/abs\/2111.14819","DOI":"10.1109\/CVPR52688.2022.01871"},{"key":"8_CR90","doi-asserted-by":"crossref","unstructured":"Zhai, X., Kolesnikov, A., Houlsby, N., Beyer, L.: Scaling vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12104\u201312113 (2022)","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"8_CR91","doi-asserted-by":"publisher","first-page":"7834","DOI":"10.1109\/TIP.2020.3006377","volume":"29","author":"Y Zhang","year":"2020","unstructured":"Zhang, Y., et al.: Collaborative unsupervised domain adaptation for medical image diagnosis. IEEE Trans. Image Process. 29, 7834\u20137844 (2020)","journal-title":"IEEE Trans. Image Process."},{"issue":"1","key":"8_CR92","doi-asserted-by":"publisher","first-page":"380","DOI":"10.1038\/s41597-023-02237-5","volume":"10","author":"Y Zhang","year":"2023","unstructured":"Zhang, Y., et al.: Children\u2019s dental panoramic radiographs dataset for caries segmentation and dental disease detection. Sci. Data 10(1), 380 (2023)","journal-title":"Sci. Data"},{"key":"8_CR93","doi-asserted-by":"publisher","first-page":"4036","DOI":"10.1109\/TIP.2023.3293771","volume":"32","author":"HY Zhou","year":"2023","unstructured":"Zhou, H.Y., et al.: nnFormer: volumetric medical image segmentation via a 3D transformer. IEEE Trans. Image Process. 32, 4036\u20134045 (2023)","journal-title":"IEEE Trans. Image Process."},{"key":"8_CR94","unstructured":"Zhu, J., Qi, Y., Wu, J.: Medical sam 2: segment medical images as video via segment anything model 2. arXiv preprint (2024). arXiv:2408.00874"}],"container-title":["Lecture Notes in Computer Science","Supervised and Semi-supervised Multi-structure Segmentation and Landmark Detection in Dental Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88977-6_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,16]],"date-time":"2025-05-16T05:23:21Z","timestamp":1747373001000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88977-6_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031889769","9783031889776"],"references-count":94,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88977-6_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"17 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MICCAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Medical Image Computing and Computer-Assisted Intervention","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Marrakesh","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Morocco","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"miccai2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conferences.miccai.org\/2024\/en\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}