{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T14:04:10Z","timestamp":1771077850417,"version":"3.50.1"},"reference-count":122,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2024M751192"],"award-info":[{"award-number":["2024M751192"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100012557","name":"Six Talent Climax Foundation of Jiangsu","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100012557","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100016105","name":"Project on Maternal and Child Health Talents of Jiangsu Province","doi-asserted-by":"publisher","award":["F202322"],"award-info":[{"award-number":["F202322"]}],"id":[{"id":"10.13039\/100016105","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276116"],"award-info":[{"award-number":["62276116"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010014","name":"Six Talent Peaks Project in Jiangsu Province","doi-asserted-by":"publisher","award":["DZXX-122"],"award-info":[{"award-number":["DZXX-122"]}],"id":[{"id":"10.13039\/501100010014","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.eswa.2026.131462","type":"journal-article","created":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T00:31:36Z","timestamp":1770337896000},"page":"131462","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Transformer-masked autoencoder (MAE) for robust medical image classification: A comprehensive survey"],"prefix":"10.1016","volume":"313","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-1427-8195","authenticated-orcid":false,"given":"Ernest","family":"Asimeng","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5406-0621","authenticated-orcid":false,"given":"Jun","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7091-7717","authenticated-orcid":false,"given":"Kai","family":"Han","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0493-634X","authenticated-orcid":false,"given":"Chongwen","family":"Lyu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8454-9468","authenticated-orcid":false,"given":"Zhe","family":"Liu","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"6","key":"10.1016\/j.eswa.2026.131462_bib0001","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s10278-025-01481-y","article-title":"Vision transformers in medical imaging: A comprehensive review of advancements and applications across multiple diseases","volume":"38","author":"Aburass","year":"2025","journal-title":"Journal of Imaging Informatics in Medicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0002","series-title":"2024 IEEE 15th annual ubiquitous computing, electronics & mobile communication conference (UEMCON)","first-page":"720","article-title":"Optimizing breast cancer classification with ensemble methods: A focus on bagging techniques","author":"Almalki","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0003","doi-asserted-by":"crossref","DOI":"10.1016\/j.compbiomed.2024.108841","article-title":"Cross-corpus speech emotion recognition with transformers: Leveraging handcrafted features and data augmentation","volume":"179","author":"Alroobaea","year":"2024","journal-title":"Computers in Biology and Medicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0004","unstructured":"Arasteh, S. T., Shaigan, M., Kuhl, C., Kather, J. N., Nebelung, S., & Truhn, D. (2025a). Resolution scaling governs dinov3 transfer performance in chest radiograph classification. arXiv: 2510.07191https:\/\/arxiv.org\/abs\/2510.07191."},{"key":"10.1016\/j.eswa.2026.131462_bib0005","unstructured":"Arasteh, S. T., Shaigan, M., Kuhl, C., Kather, J. N., Nebelung, S., & Truhn, D. (2025b). Resolution scaling governs dinov3 transfer performance in chest radiograph classification. arXiv preprint arXiv:2510.07191."},{"key":"10.1016\/j.eswa.2026.131462_bib0006","unstructured":"Baharoon, M., Qureshi, W., Ouyang, J., Xu, Y., Aljouie, A., & Peng, W. (2024). Evaluating general purpose vision foundation models for medical image analysis: An experimental study of dinov2 on radiology benchmarks. arXiv: 2312.02366https:\/\/arxiv.org\/abs\/2312.02366."},{"key":"10.1016\/j.eswa.2026.131462_bib0007","doi-asserted-by":"crossref","DOI":"10.1016\/j.cmpb.2023.107936","article-title":"Masked autoencoders with handcrafted feature predictions: Transformer for weakly supervised esophageal cancer classification","volume":"244","author":"Bai","year":"2024","journal-title":"Computer Methods and Programs in Biomedicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0008","doi-asserted-by":"crossref","first-page":"28","DOI":"10.1007\/s10462-023-10662-6","article-title":"Autoencoders and their applications in machine learning: A survey","volume":"57","author":"Berahmand","year":"2024","journal-title":"Artificial Intelligence Review"},{"key":"10.1016\/j.eswa.2026.131462_bib0009","doi-asserted-by":"crossref","DOI":"10.1016\/j.rvsc.2024.105317","article-title":"Artificial intelligence in veterinary diagnostic imaging: Perspectives and limitations","volume":"175","author":"Burti","year":"2024","journal-title":"Research in Veterinary Science"},{"key":"10.1016\/j.eswa.2026.131462_bib0010","doi-asserted-by":"crossref","first-page":"167","DOI":"10.1080\/17435889.2024.2439776","article-title":"Nanomedicine and clinical diagnostics part I: Applications in conventional imaging (MRI, X-ray\/CT, and ultrasound)","volume":"20","author":"Butt","year":"2025","journal-title":"Nanomedicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0011","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2024.106131","article-title":"MAE-EEG-transformer: A transformer-based approach combining masked autoencoder and cross-individual data augmentation pre-training for eeg classification","volume":"94","author":"Cai","year":"2024","journal-title":"Biomedical Signal Processing and Control"},{"key":"10.1016\/j.eswa.2026.131462_bib0012","series-title":"Enhancing Transformer Architectures for Dialogue Modelling Through Contextual Reencoding","author":"Caldarini","year":"2025"},{"key":"10.1016\/j.eswa.2026.131462_bib0013","unstructured":"Cao, S., Xu, P., & Clifton, D. A. (2022). How to understand masked autoencoders. arXiv preprint arXiv:2202.03670."},{"key":"10.1016\/j.eswa.2026.131462_bib0014","series-title":"Exploring Label Efficiency with Semi-Supervision and Self-Supervision Methods","author":"Cerqueira","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0015","series-title":"International conference on machine learning","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020"},{"key":"10.1016\/j.eswa.2026.131462_bib0016","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2022.102444","article-title":"Recent advances and clinical applications of deep learning in medical image analysis","volume":"79","author":"Chen","year":"2022","journal-title":"Medical Image Analysis"},{"key":"10.1016\/j.eswa.2026.131462_bib0017","unstructured":"Chiocchetti, A., Dossena, M., Irwin, C., & Portinale, L. (2024). Beyond labels: A self-supervised framework with masked autoencoders and random cropping for breast cancer subtype classification. arXiv preprint arXiv:2410.12006."},{"key":"10.1016\/j.eswa.2026.131462_bib0018","doi-asserted-by":"crossref","unstructured":"Das, B. K., Zhao, G., Liu, H., Re, T. J., Comaniciu, D., Gibson, E., & Maier, A. (2025). Self pre-training with adaptive mask autoencoders for variable-contrast 3D medical imaging. arXiv preprint arXiv:2501.09096.","DOI":"10.1109\/ISBI60581.2025.10981097"},{"key":"10.1016\/j.eswa.2026.131462_bib0019","doi-asserted-by":"crossref","unstructured":"Dhinagar, N. J., Thomopoulos, S. I., & Thompson, P. M. (2025). Leveraging a vision-language model with natural text supervision for MRI retrieval, captioning, classification, and visual question answering. bioRxiv, (pp. 2025\u20132027).","DOI":"10.1101\/2025.02.15.638446"},{"key":"10.1016\/j.eswa.2026.131462_bib0020","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111680","article-title":"Improving imbalanced medical image classification through gan-based data augmentation methods","volume":"166","author":"Ding","year":"2025","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.eswa.2026.131462_bib0021","doi-asserted-by":"crossref","unstructured":"Djoumessi, K., Mensah, S. O., & Berens, P. (2025). A hybrid fully convolutional cnn-transformer model for inherently interpretable medical image classification. arXiv preprint arXiv:2504.08481.","DOI":"10.1007\/978-3-032-17611-0_11"},{"key":"10.1016\/j.eswa.2026.131462_bib0022","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., & et al. (2020). An image is worth 16\u202f\u00d7\u202f16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929."},{"key":"10.1016\/j.eswa.2026.131462_bib0023","first-page":"89","article-title":"A nonparametric \u201ctrim and fill\u201d method of accounting for publication bias in meta-analysis","volume":"95","author":"Duval","year":"2000","journal-title":"Journal of the American Statistical Association"},{"key":"10.1016\/j.eswa.2026.131462_bib0024","unstructured":"Falconnier, P. (2025). Representation learning for 3D brain imaging: A comparative study of pre-trained encoders, foundation models and self-supervised learning methods."},{"key":"10.1016\/j.eswa.2026.131462_bib0025","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"24449","article-title":"Masked auto-encoders meet generative adversarial networks and beyond","author":"Fei","year":"2023"},{"key":"10.1016\/j.eswa.2026.131462_bib0026","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s10489-024-06029-1","article-title":"A swin-transformer-based network with inductive bias ability for medical image segmentation","volume":"55","author":"Gao","year":"2025","journal-title":"Applied Intelligence"},{"key":"10.1016\/j.eswa.2026.131462_bib0027","doi-asserted-by":"crossref","unstructured":"Gupta, A., Osman, I., Shehata, M. S., & Braun, J. W. (2024). Medmae: A self-supervised backbone for medical imaging tasks. arXiv preprint arXiv:2407.14784.","DOI":"10.21203\/rs.3.rs-5028318\/v1"},{"issue":"6","key":"10.1016\/j.eswa.2026.131462_bib0028","first-page":"1","article-title":"Medmae: A self-supervised backbone for medical imaging tasks","volume":"13","author":"Gupta","year":"2025","journal-title":"Computation"},{"key":"10.1016\/j.eswa.2026.131462_bib0029","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.123052","article-title":"Deep semi-supervised learning for medical image segmentation: A review","volume":"245","author":"Han","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.131462_bib0030","series-title":"2024 27th international conference on computer supported cooperative work in design","first-page":"2046","article-title":"Mat-vit: A vision transformer with mae-based self-supervised auxiliary task for medical image classification","author":"Han","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0031","series-title":"2024 6th international conference on electrical engineering and information & communication technology (ICEEICT)","first-page":"1286","article-title":"Swinmednet: Leveraging swin transformer for robust diabetic retinopathy classification from the retinamnist2d dataset","author":"Haque","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0032","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"16000","article-title":"Masked autoencoders are scalable vision learners","author":"He","year":"2022"},{"key":"10.1016\/j.eswa.2026.131462_bib0033","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"9729","article-title":"Momentum contrast for unsupervised visual representation learning","author":"He","year":"2020"},{"key":"10.1016\/j.eswa.2026.131462_bib0034","first-page":"248","article-title":"Foundation model for advancing healthcare: challenges, opportunities and future directions","volume":"17","author":"He","year":"2024","journal-title":"IEEE Reviews in Biomedical Engineering"},{"key":"10.1016\/j.eswa.2026.131462_bib0035","first-page":"5149","article-title":"Meta-learning in neural networks: A survey","volume":"44","author":"Hospedales","year":"2021","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2026.131462_bib0036","doi-asserted-by":"crossref","first-page":"74","DOI":"10.1038\/s41746-023-00811-0","article-title":"Self-supervised learning for medical image classification: A systematic review and implementation guidelines","volume":"6","author":"Huang","year":"2023","journal-title":"NPJ Digital Medicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0037","unstructured":"Huang, Y., Xu, J., Lai, J., Jiang, Z., Chen, T., Li, Z., Yao, Y., Ma, X., Yang, L., Chen, H., & et al. (2023b). Advancing transformer architecture in long-context large language models: A comprehensive survey. arXiv preprint arXiv:2311.12351."},{"key":"10.1016\/j.eswa.2026.131462_bib0038","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TGRS.2022.3217892","article-title":"Masked auto-encoding spectral-spatial transformer for hyperspectral image classification","volume":"60","author":"Ibanez","year":"2022","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.131462_bib0039","doi-asserted-by":"crossref","first-page":"1589","DOI":"10.1007\/s12559-020-09787-5","article-title":"Pneumonia classification using deep learning from chest X-ray images during COVID-19","volume":"16","author":"Ibrahim","year":"2024","journal-title":"Cognitive Computation"},{"key":"10.1016\/j.eswa.2026.131462_bib0040","unstructured":"Ikezogwo, W. O., Seyfioglu, M. S., & Shapiro, L. (2022). Multi-modal masked autoencoders learn compositional histopathological representations. arXiv preprint arXiv:2209.01534."},{"key":"10.1016\/j.eswa.2026.131462_bib0041","series-title":"International conference on medical image computing and computer-assisted intervention","first-page":"352","article-title":"Deblurring masked autoencoder is better recipe for ultrasound image recognition","author":"Kang","year":"2023"},{"key":"10.1016\/j.eswa.2026.131462_bib0042","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3505244","article-title":"Transformers in vision: A survey","volume":"54","author":"Khan","year":"2022","journal-title":"ACM Computing Surveys (CSUR)"},{"key":"10.1016\/j.eswa.2026.131462_bib0043","series-title":"Towards Robust Deep Learning for Medical Imaging With Limited and Noisy Labeled Data","author":"Khanal","year":"2025"},{"key":"10.1016\/j.eswa.2026.131462_bib0044","doi-asserted-by":"crossref","first-page":"69","DOI":"10.1186\/s12880-022-00793-7","article-title":"Transfer learning for medical image classification: A literature review","volume":"22","author":"Kim","year":"2022","journal-title":"BMC Medical Imaging"},{"key":"10.1016\/j.eswa.2026.131462_bib0045","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"7918","article-title":"Understanding masked autoencoders via hierarchical latent variable models","author":"Kong","year":"2023"},{"key":"10.1016\/j.eswa.2026.131462_bib0046","doi-asserted-by":"crossref","first-page":"1348","DOI":"10.1109\/JSTARS.2023.3337132","article-title":"Instructional mask autoencoder: A scalable learner for hyperspectral image classification","volume":"17","author":"Kong","year":"2023","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.131462_bib0047","doi-asserted-by":"crossref","unstructured":"Le, T.-D., Nguyen, T. T., Ha, V. N., Chatzinotas, S., Jouvet, P., & Noumeir, R. (2024). The impact of lora adapters for llms on clinical nlp classification under data limitations. arXiv preprint arXiv:2407.19299.","DOI":"10.1109\/ACCESS.2025.3582037"},{"key":"10.1016\/j.eswa.2026.131462_bib0048","unstructured":"Lee, H. H., Gu, Y., Zhao, T., Xu, Y., Yang, J., Usuyama, N., Wong, C., Wei, M., Landman, B. A., Huo, Y., & et al. (2024). Foundation models for biomedical image segmentation: A survey. arXiv preprint arXiv:2401.07654."},{"key":"10.1016\/j.eswa.2026.131462_bib0049","unstructured":"Li, H., Shui, Z., Zhang, Y., Zhu, C., & Yang, L. (2025). Pathvq: Reforming computational pathology foundation model for whole slide image analysis via vector quantization. arXiv preprint arXiv:2503.06482."},{"key":"10.1016\/j.eswa.2026.131462_bib0050","doi-asserted-by":"crossref","first-page":"231","DOI":"10.1007\/s40846-024-00863-x","article-title":"Role of artificial intelligence in medical image analysis: A review of current trends and future directions","volume":"44","author":"Li","year":"2024","journal-title":"Journal of Medical and Biological Engineering"},{"key":"10.1016\/j.eswa.2026.131462_bib0051","doi-asserted-by":"crossref","first-page":"2888","DOI":"10.1109\/TVCG.2023.3261935","article-title":"How does attention work in vision transformers? A visual analytics attempt","volume":"29","author":"Li","year":"2023","journal-title":"IEEE Transactions on Visualization and Computer Graphics"},{"key":"10.1016\/j.eswa.2026.131462_bib0052","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s11760-025-04169-6","article-title":"Msmae-net: Multi-semantic and multi-attention enhanced network for image forgery localization","volume":"19","author":"Liao","year":"2025","journal-title":"Signal, Image and Video Processing"},{"key":"10.1016\/j.eswa.2026.131462_bib0053","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"10012","article-title":"Swin transformer: Hierarchical vision transformer using shifted windows","author":"Liu","year":"2021"},{"key":"10.1016\/j.eswa.2026.131462_bib0054","series-title":"International workshop on machine learning in medical imaging","first-page":"95","article-title":"Vis-mae: An efficient self-supervised learning approach on medical image segmentation and classification","author":"Liu","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0055","doi-asserted-by":"crossref","first-page":"494","DOI":"10.23887\/janapati.v13i3.81425","article-title":"Classification of lung diseases in X-ray images using transformer-based deep learning models","volume":"13","author":"Mahajaya","year":"2024","journal-title":"Jurnal Nasional Pendidikan Teknik Informatika: JANAPATI"},{"key":"10.1016\/j.eswa.2026.131462_bib0056","article-title":"Medical supervised masked autoencoder: Crafting a better masking strategy and efficient fine-tuning schedule for medical image classification","volume":"162","author":"Mao","year":"2024","journal-title":"Applied Soft Computing"},{"key":"10.1016\/j.eswa.2026.131462_bib0057","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2023.102868","article-title":"Cross-dimensional transfer learning in medical image segmentation with deep learning","volume":"88","author":"Messaoudi","year":"2023","journal-title":"Medical Image Analysis"},{"key":"10.1016\/j.eswa.2026.131462_bib0058","series-title":"Representation learning of multimodal and longitudinal data for renal transplantation monitoring","author":"Milecki","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0059","doi-asserted-by":"crossref","DOI":"10.1038\/s41598-025-09041-8","article-title":"Medical slice transformer for improved diagnosis and explainability on 3d medical images with dinov2","volume":"15","author":"M\u00fcller-Franzes","year":"2025","journal-title":"Scientific Reports"},{"key":"10.1016\/j.eswa.2026.131462_bib0060","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/s40537-025-01196-5","article-title":"Meta-transformer: Leveraging metaheuristic algorithms for agricultural commodity price forecasting","volume":"12","author":"Nayak","year":"2025","journal-title":"Journal of Big Data"},{"key":"10.1016\/j.eswa.2026.131462_bib0061","unstructured":"Nerella, S., Bandyopadhyay, S., Zhang, J., Contreras, M., Siegel, S., Bumin, A., Silva, B., Sena, J., Shickel, B., Bihorac, A., & et al. (2023). Transformers in healthcare: A survey. arXiv preprint arXiv:2307.00067."},{"key":"10.1016\/j.eswa.2026.131462_bib0062","doi-asserted-by":"crossref","first-page":"282","DOI":"10.3390\/diagnostics15030282","article-title":"Artificial intelligence-empowered radiology-current status and critical review","volume":"15","author":"Obuchowicz","year":"2025","journal-title":"Diagnostics"},{"key":"10.1016\/j.eswa.2026.131462_bib0063","doi-asserted-by":"crossref","unstructured":"Obuchowicz, R., Strzelecki, M., & Pi\u00f3rkowski, A. (2024). Clinical applications of artificial intelligence in medical imaging and image processing\u2014A review.","DOI":"10.3390\/books978-3-7258-1260-8"},{"key":"10.1016\/j.eswa.2026.131462_bib0064","first-page":"2319","article-title":"Emerging trends in ai-powered medical imaging: enhancing diagnostic accuracy and treatment decisions","volume":"13","author":"Oyeniyi","year":"2024","journal-title":"International Journal of Enhanced Research In Science Technology & Engineering"},{"key":"10.1016\/j.eswa.2026.131462_bib0065","doi-asserted-by":"crossref","DOI":"10.1016\/j.compbiomed.2024.108554","article-title":"Kidney tumor classification on ct images using self-supervised learning","volume":"176","author":"\u00d6zbay","year":"2024","journal-title":"Computers in Biology and Medicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0066","series-title":"Detection, quantification, malignancy prediction and growth forecasting of pulmonary nodules using deep learning in follow-up CT scans","author":"Palou","year":"2021"},{"key":"10.1016\/j.eswa.2026.131462_bib0067","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2023.106126","article-title":"Vision transformers in medical computer vision-a contemplative retrospection","volume":"122","author":"Parvaiz","year":"2023","journal-title":"Engineering Applications of Artificial Intelligence"},{"key":"10.1016\/j.eswa.2026.131462_bib0068","series-title":"Flexible body-conformal ultrasound systems for autonomous image-guided neuromodulation","author":"Pashaei","year":"2021"},{"key":"10.1016\/j.eswa.2026.131462_bib0069","doi-asserted-by":"crossref","DOI":"10.1117\/1.JMI.10.5.051804","article-title":"Regulatory considerations for medical imaging AI\/ML devices in the united states: Concepts and challenges","volume":"10","author":"Petrick","year":"2023","journal-title":"Journal of Medical Imaging"},{"key":"10.1016\/j.eswa.2026.131462_bib0070","unstructured":"Qiu, Z., Chao, H., Lin, T., Chang, W., Yang, Z., Jiao, W., Shen, Y., Zhang, Y., Yang, Y., Liu, W., & et al. (2024). From pixels to gigapixels: Bridging local inductive bias and long-range dependencies with pixel-mamba. arXiv preprint arXiv:2412.16711."},{"key":"10.1016\/j.eswa.2026.131462_bib0071","doi-asserted-by":"crossref","first-page":"1607","DOI":"10.1007\/s12530-024-09581-w","article-title":"Self-supervised learning for medical image analysis: a comprehensive review","volume":"15","author":"Rani","year":"2024","journal-title":"Evolving Systems"},{"key":"10.1016\/j.eswa.2026.131462_bib0072","series-title":"Medical imaging 2025: Computer-aided diagnosis","first-page":"912","article-title":"Vision transformer for efficient chest X-ray and gastrointestinal image classification","volume":"13407","author":"Regmi","year":"2025"},{"key":"10.1016\/j.eswa.2026.131462_bib0073","unstructured":"Richemond, P. H., Grill, J.-B., Altch\u00e9, F., Tallec, C., Strub, F., Brock, A., Smith, S., De, S., Pascanu, R., Piot, B., & et al. (2020). Byol works even without batch statistics. arXiv preprint arXiv:2010.10241."},{"key":"10.1016\/j.eswa.2026.131462_bib0074","series-title":"International symposium on visual computing","first-page":"320","article-title":"Dino-cxr: A self supervised method based on vision transformer for chest X-ray classification","author":"Shakouri","year":"2023"},{"key":"10.1016\/j.eswa.2026.131462_bib0075","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2023.102802","article-title":"Transformers in medical imaging: A survey","volume":"88","author":"Shamshad","year":"2023","journal-title":"Medical Image Analysis"},{"key":"10.1016\/j.eswa.2026.131462_bib0076","series-title":"International conference on sustainability innovation in computing and engineering ICSICE 2024","first-page":"1343","article-title":"Comparative analysis of vision transformer and CNN architectures in medical image classification","author":"Sharma","year":"2025"},{"key":"10.1016\/j.eswa.2026.131462_bib0077","series-title":"International conference on medical image computing and computer-assisted intervention","first-page":"273","article-title":"Mora: Lora guided multi-modal disease diagnosis with missing modality","author":"Shi","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0078","doi-asserted-by":"crossref","DOI":"10.1016\/j.sigpro.2024.109683","article-title":"Integrating self-attention mechanisms in deep learning: A novel dual-head ensemble transformer with its application to bearing fault diagnosis","volume":"227","author":"Snyder","year":"2025","journal-title":"Signal Processing"},{"key":"10.1016\/j.eswa.2026.131462_bib0079","article-title":"Mambamim: Pre-training mamba with state space token interpolation and its application to medical image segmentation","volume":"100","author":"Tang","year":"2025","journal-title":"Medical Image Analysis"},{"key":"10.1016\/j.eswa.2026.131462_bib0080","series-title":"International conference on medical image computing and computer-assisted intervention","first-page":"330","article-title":"Hyspark: Hybrid sparse masking for large scale medical image pre-training","author":"Tang","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0081","doi-asserted-by":"crossref","unstructured":"Tang, F., Yao, Q., Ma, W., Wu, C., Jiang, Z., & Zhou, S. K. (2025b). Hi-end-mae: Hierarchical encoder-driven masked autoencoders are stronger vision learners for medical image segmentation. arXiv preprint arXiv:2502.08347.","DOI":"10.1016\/j.media.2025.103770"},{"key":"10.1016\/j.eswa.2026.131462_bib0082","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2023.105605","article-title":"Htc-net: A hybrid cnn-transformer framework for medical image segmentation","volume":"88","author":"Tang","year":"2024","journal-title":"Biomedical Signal Processing and Control"},{"key":"10.1016\/j.eswa.2026.131462_bib0083","series-title":"Proceedings of machine learning for health","first-page":"54","article-title":"How transferable are self-supervised features in medical image classification tasks?","volume":"158","author":"Truong","year":"2021"},{"key":"10.1016\/j.eswa.2026.131462_bib0084","doi-asserted-by":"crossref","first-page":"542","DOI":"10.3390\/diagnostics14050542","article-title":"Ultrasound image analysis with vision transformers","volume":"14","author":"Vafaeezadeh","year":"2024","journal-title":"Diagnostics"},{"key":"10.1016\/j.eswa.2026.131462_bib0085","doi-asserted-by":"crossref","DOI":"10.1016\/j.compbiomed.2023.107336","article-title":"Pyramid-based self-supervised learning for histopathological image classification","volume":"165","author":"Wang","year":"2023","journal-title":"Computers in Biology and Medicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0086","article-title":"Mcswin: A self-supervised learning framework combining mask image modeling with contrastive learning for cervical oct image classification","author":"Wang","year":"2024","journal-title":"Authorea Preprints"},{"key":"10.1016\/j.eswa.2026.131462_bib0087","doi-asserted-by":"crossref","first-page":"8802","DOI":"10.3390\/s23218802","article-title":"Swin transformer-based edge guidance network for RGB-D salient object detection","volume":"23","author":"Wang","year":"2023","journal-title":"Sensors"},{"key":"10.1016\/j.eswa.2026.131462_bib0088","first-page":"14316","article-title":"Hsimae: A unified masked autoencoder with large-scale pre-training for hyperspectral image classification","volume":"17","author":"Wang","year":"2024","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.131462_bib0089","doi-asserted-by":"crossref","DOI":"10.1038\/s41598-023-46433-0","article-title":"Self-supervised pre-training with contrastive and masked autoencoder methods for dealing with small datasets in deep learning for medical imaging","volume":"13","author":"Wolf","year":"2023","journal-title":"Scientific Reports"},{"key":"10.1016\/j.eswa.2026.131462_bib0090","article-title":"Attention-based saliency maps improve interpretability of pneumothorax classification","volume":"5","author":"Wollek","year":"2023","journal-title":"Radiology: Artificial Intelligence"},{"issue":"10","key":"10.1016\/j.eswa.2026.131462_bib0091","first-page":"3674","article-title":"Pan-cancer histopathology wsi pre-training with position-aware masked autoencoder","volume":"43","author":"Wu","year":"2024","journal-title":"IEEE Transactions on Medical Imaging"},{"key":"10.1016\/j.eswa.2026.131462_bib0092","series-title":"International conference on medical image computing and computer-assisted intervention","first-page":"714","article-title":"Position-aware masked autoencoder for histopathology wsi representation learning","author":"Wu","year":"2023"},{"issue":"2","key":"10.1016\/j.eswa.2026.131462_bib0093","first-page":"812","article-title":"Large-scale 3d medical image pre-training with geometric context priors","volume":"47","author":"Wu","year":"2025","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2026.131462_bib0094","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"3588","article-title":"Delving into masked autoencoders for multi-label thorax disease classification","author":"Xiao","year":"2023"},{"key":"10.1016\/j.eswa.2026.131462_bib0095","doi-asserted-by":"crossref","DOI":"10.3389\/fmed.2023.1114571","article-title":"Mae-transrnet: An improved transformer-convnet architecture with masked autoencoder for cardiac MRI registration","volume":"10","author":"Xiao","year":"2023","journal-title":"Frontiers in Medicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0096","series-title":"International conference on medical image computing and computer-assisted intervention","first-page":"13","article-title":"Medim: Boost medical image representation via radiology report-guided masking","author":"Xie","year":"2023"},{"key":"10.1016\/j.eswa.2026.131462_bib0097","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2024.103304","article-title":"Rethinking masked image modelling for medical image representation","volume":"98","author":"Xie","year":"2024","journal-title":"Medical Image Analysis"},{"key":"10.1016\/j.eswa.2026.131462_bib0098","doi-asserted-by":"crossref","first-page":"901","DOI":"10.3390\/bioengineering10080901","article-title":"Self-supervised learning application on COVID-19 chest X-ray image classification using masked autoencoder","volume":"10","author":"Xing","year":"2023","journal-title":"Bioengineering"},{"key":"10.1016\/j.eswa.2026.131462_bib0099","unstructured":"Xu, J., & Stirenko, S. (2022). Self-supervised model based on masked autoencoders advance ct scans classification. arXiv preprint arXiv:2210.05073."},{"key":"10.1016\/j.eswa.2026.131462_bib0100","doi-asserted-by":"crossref","first-page":"214","DOI":"10.1109\/OJEMB.2024.3374966","article-title":"Masked modeling-based ultrasound image classification via self-supervised learning","volume":"5","author":"Xu","year":"2024","journal-title":"IEEE Open Journal of Engineering in Medicine and Biology"},{"key":"10.1016\/j.eswa.2026.131462_bib0101","doi-asserted-by":"crossref","DOI":"10.1016\/j.compbiomed.2023.107037","article-title":"Swin mae: Masked autoencoders for small datasets","volume":"161","author":"Xu","year":"2023","journal-title":"Computers in Biology and Medicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0102","series-title":"Mining biomedical text, images and visual features for information retrieval","first-page":"423","article-title":"Image informatics for clinical and preclinical biomedical analysis","author":"Yadav","year":"2025"},{"key":"10.1016\/j.eswa.2026.131462_bib0103","series-title":"Medical imaging with deep learning-short papers","article-title":"Low-rank adaptation with swin transformers to enhance skin cancer diagnosis","author":"Yadla","year":"2025"},{"key":"10.1016\/j.eswa.2026.131462_bib0104","series-title":"Fourth international conference on computer vision and pattern analysis (ICCPA 2024)","first-page":"70","article-title":"Two-stage self-supervised training vision transformers for small datasets","volume":"13256","author":"Yang","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0105","article-title":"Vision transformer with masked autoencoders for referable diabetic retinopathy classification based on large-size retina image","volume":"19","author":"Yang","year":"2024","journal-title":"PLoS One"},{"issue":"3","key":"10.1016\/j.eswa.2026.131462_bib0106","first-page":"712","article-title":"Diffmic-v2: Medical image classification via improved diffusion network","volume":"44","author":"Yang","year":"2025","journal-title":"IEEE Transactions on Medical Imaging"},{"key":"10.1016\/j.eswa.2026.131462_bib0107","doi-asserted-by":"crossref","DOI":"10.1142\/S0218348X24500609","article-title":"A novel transformer method pretrained with masked autoencoders and fractal dimension for diabetic retinopathy classification","volume":"32","author":"Yang","year":"2024","journal-title":"Fractals"},{"key":"10.1016\/j.eswa.2026.131462_bib0108","series-title":"International conference on medical image computing and computer-assisted intervention","first-page":"79","article-title":"Adapting pre-trained generative model to medical image for data augmentation","author":"Yuan","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0109","doi-asserted-by":"crossref","first-page":"88","DOI":"10.1109\/MCOM.002.2300257","article-title":"Transformer masked autoencoders for next-generation wireless communications: Architecture and opportunities","volume":"62","author":"Zayat","year":"2023","journal-title":"IEEE Communications Magazine"},{"key":"10.1016\/j.eswa.2026.131462_bib0110","doi-asserted-by":"crossref","unstructured":"Zhang, C., Zhang, C., Song, J., Yi, J. S. K., Zhang, K., & Kweon, I. S. (2022a). A survey on masked autoencoder for self-supervised learning in vision and beyond. arXiv preprint arXiv:2208.00173.","DOI":"10.24963\/ijcai.2023\/762"},{"key":"10.1016\/j.eswa.2026.131462_bib0111","doi-asserted-by":"crossref","first-page":"2402","DOI":"10.1007\/s10278-023-00894-x","article-title":"Improving image classification of knee radiographs: an automated image labeling approach","volume":"36","author":"Zhang","year":"2023","journal-title":"Journal of Digital Imaging"},{"key":"10.1016\/j.eswa.2026.131462_bib0112","article-title":"Medical image segmentation by combining feature enhancement swin transformer and upernet","volume":"15","author":"Zhang","year":"2025","journal-title":"Scientific Reports"},{"key":"10.1016\/j.eswa.2026.131462_bib0113","series-title":"Advances in neural information processing systems","first-page":"27127","article-title":"How mask matters: Towards theoretical understandings of masked autoencoders","volume":"35","author":"Zhang","year":"2022"},{"key":"10.1016\/j.eswa.2026.131462_bib0114","series-title":"Machine learning in clinical application of medical imaging for lesion detection, segmentation, diagnosis, therapy, and prognosis prediction","author":"Zhang","year":"2020"},{"key":"10.1016\/j.eswa.2026.131462_bib0115","article-title":"From patches to wsis: A systematic review of deep multiple instance learning in computational pathology","volume":"114","author":"Zhang","year":"2025","journal-title":"Information Fusion"},{"key":"10.1016\/j.eswa.2026.131462_bib0116","article-title":"Segment anything model for medical image segmentation: Current applications and future directions","volume":"172","author":"Zhang","year":"2024","journal-title":"Computers in Biology and Medicine"},{"key":"10.1016\/j.eswa.2026.131462_bib0117","series-title":"Dualprompt-medcap: a dual-prompt enhanced approach for medical image captioning","first-page":"205","author":"Zhao","year":"2025"},{"key":"10.1016\/j.eswa.2026.131462_bib0118","series-title":"2023 IEEE 20th international symposium on biomedical imaging (ISBI)","first-page":"1","article-title":"Self pre-training with masked autoencoders for medical image classification and segmentation","author":"Zhou","year":"2023"},{"key":"10.1016\/j.eswa.2026.131462_bib0119","unstructured":"Zhou, Y., Diao, X., Huo, Y., Liu, Y., Fan, X., & Zhao, W. (2023b). Masked transformer for electrocardiogram classification. arXiv preprint arXiv:2309.07136."},{"key":"10.1016\/j.eswa.2026.131462_bib0120","series-title":"2024 IEEE international symposium on biomedical imaging (ISBI)","first-page":"1","article-title":"Melo: Low-rank adaptation is better than fine-tuning for medical image diagnosis","author":"Zhu","year":"2024"},{"key":"10.1016\/j.eswa.2026.131462_bib0121","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Lu, S.-Y., Huang, T., Liu, L., & Liu, Z. (2025). Lka: Large kernel adapter for enhanced medical image classification. arXiv preprint arXiv:2506.19118.","DOI":"10.1007\/978-3-032-04978-0_38"},{"issue":"1","key":"10.1016\/j.eswa.2026.131462_bib0122","first-page":"104","article-title":"Mim: Mask in mask self-supervised pre-training for 3d medical image analysis","volume":"44","author":"Zhuang","year":"2025","journal-title":"IEEE Transactions on Medical Imaging"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426003751?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426003751?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T13:13:26Z","timestamp":1771074806000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417426003751"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":122,"alternative-id":["S0957417426003751"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2026.131462","relation":{},"ISSN":["0957-4174"],"issn-type":[{"value":"0957-4174","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Transformer-masked autoencoder (MAE) for robust medical image classification: A comprehensive survey","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2026.131462","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"131462"}}