{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T02:57:11Z","timestamp":1772247431827,"version":"3.50.1"},"reference-count":118,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2026,1,7]],"date-time":"2026-01-07T00:00:00Z","timestamp":1767744000000},"content-version":"vor","delay-in-days":44,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100002701","name":"Ministry of Education","doi-asserted-by":"publisher","award":["2020R1A6A1A03038540"],"award-info":[{"award-number":["2020R1A6A1A03038540"]}],"id":[{"id":"10.13039\/501100002701","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014188","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","award":["No.RS-2019-II190231"],"award-info":[{"award-number":["No.RS-2019-II190231"]}],"id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014188","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","award":["RS-2025-25443732"],"award-info":[{"award-number":["RS-2025-25443732"]}],"id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Information Technology Research Center, South Korea","award":["IITP-2022-RS-2022-00156354"],"award-info":[{"award-number":["IITP-2022-RS-2022-00156354"]}]},{"name":"National Program for Excellence in SW","award":["2024-0-00037"],"award-info":[{"award-number":["2024-0-00037"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Artif Intell Rev"],"DOI":"10.1007\/s10462-025-11450-0","type":"journal-article","created":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T14:45:37Z","timestamp":1763995537000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Revisiting U-Net: a foundational backbone for modern generative AI"],"prefix":"10.1007","volume":"59","author":[{"given":"Marvin John","family":"Ignacio","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sangyun","family":"Shin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hulin","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Seong Joon","family":"Yoo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongil","family":"Han","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yong-Guk","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,24]]},"reference":[{"key":"11450_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ejrad.2024.111389","volume":"173","author":"J Abrantes","year":"2024","unstructured":"Abrantes J, Rouzrokh P (2024) Explaining explainability: the role of XAI in medical imaging. Eur J Radiol 173:111389","journal-title":"Eur J Radiol"},{"issue":"1","key":"11450_CR2","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1186\/s42492-023-00140-9","volume":"6","author":"K Al-hammuri","year":"2023","unstructured":"Al-hammuri K, Gebali F, Kanan A, Chelvan IT (2023) Vision transformer architecture and applications in digital health: a tutorial and survey. Vis Comput Ind Biomed Art 6(1):14","journal-title":"Vis Comput Ind Biomed Art"},{"key":"11450_CR3","unstructured":"Ansar W, Goswami S, Chakrabarti A (2024) A survey on transformers in NLP with focus on efficiency. arXiv:2406.16893"},{"key":"11450_CR4","unstructured":"An J, Zhang S, Yang H, Gupta S, Huang J-B, Luo J, Yin X (2023) Latent-shift: latent diffusion with temporal shift for efficient text-to-video generation. arXiv preprint arXiv:2304.08477"},{"key":"11450_CR5","doi-asserted-by":"publisher","first-page":"557","DOI":"10.1016\/j.micpro.2011.05.007","volume":"35","author":"A Armato","year":"2011","unstructured":"Armato A, Fanucci L, Scilingo EP, Rossi DD (2011) Low-error digital hardware implementation of artificial neuron activation functions and their derivative. Microprocess Microsyst 35:557\u2013567","journal-title":"Microprocess Microsyst"},{"issue":"4","key":"11450_CR6","doi-asserted-by":"publisher","first-page":"301","DOI":"10.1007\/s42979-021-00702-9","volume":"2","author":"A Asperti","year":"2021","unstructured":"Asperti A, Evangelista D, Piccolomini EL (2021) A survey on variational autoencoders from a green ai perspective. SN Comput Sci 2(4):301","journal-title":"SN Comput Sci"},{"key":"11450_CR7","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. arXiv preprint arXiv:1607.06450"},{"key":"11450_CR8","doi-asserted-by":"crossref","unstructured":"Bar-Tal O, Chefer H, Tov O, Herrmann C, Paiss R, Zada S, Ephrat A, Hur J, Liu G, Raj A, Li Y, Rubinstein M, Michaeli T, Wang O, Sun D, Dekel T, Mosseri I (2024) Lumiere: a space-time diffusion model for video generation. In: SIGGRAPH Asia 2024 conference papers, pp 1\u201311","DOI":"10.1145\/3680528.3687614"},{"key":"11450_CR9","unstructured":"Brown TB, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler DM, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M-T, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language models are few-shot learners. arXiv:2005.14165"},{"key":"11450_CR10","doi-asserted-by":"crossref","unstructured":"Bulat A, Kossaifi J, Tzimiropoulos G, Pantic M (2020) Toward fast and accurate human pose estimation via soft-gated skip connections. In: 2020 15th IEEE international conference on automatic face and gesture recognition (FG 2020), pp 8\u201315","DOI":"10.1109\/FG47880.2020.00014"},{"key":"11450_CR11","doi-asserted-by":"publisher","first-page":"2814","DOI":"10.1109\/TKDE.2024.3361474","volume":"36","author":"H Cao","year":"2022","unstructured":"Cao H, Tan C, Gao Z, Xu Y, Chen G, Heng P-A, Li SZ (2022) A survey on generative diffusion models. IEEE Trans Knowl Data Eng 36:2814\u20132830","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"11450_CR12","unstructured":"Cao Y, Li S, Liu Y, Yan Z, Dai Y, Yu PS, Sun L (2023) A comprehensive survey of AI-generated content (AIGC): a history of generative AI from GAN to ChatGPT. arXiv:2303.04226"},{"key":"11450_CR13","first-page":"15084","volume":"34","author":"L Chen","year":"2021","unstructured":"Chen L, Lu K, Rajeswaran A, Lee K, Grover A, Laskin M, Abbeel P, Srinivas A, Mordatch I (2021) Decision transformer: reinforcement learning via sequence modeling. Neural Inf Process Syst 34:15084\u201315097","journal-title":"Neural Inf Process Syst"},{"key":"11450_CR14","doi-asserted-by":"publisher","first-page":"10558","DOI":"10.1109\/TPAMI.2024.3447085","volume":"46","author":"H Cheng","year":"2023","unstructured":"Cheng H, Zhang M, Shi JQ (2023) A survey on deep neural network pruning: taxonomy, comparison, analysis, and recommendations. IEEE Trans Pattern Anal Mach Intell 46:10558\u201310578","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"11450_CR15","doi-asserted-by":"crossref","unstructured":"Chen W, Liu B, Peng S, Sun J, Qiao X (2018) S3D-UNet: separable 3D U-Net for brain tumor segmentation. In: BrainLes@MICCAI","DOI":"10.1007\/978-3-030-11726-9_32"},{"key":"11450_CR16","doi-asserted-by":"crossref","unstructured":"Chen Z, Zhuang J, Liang X, Lin L (2019) Blending-target domain adaptation by adversarial meta-adaptation networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 2248\u20132257","DOI":"10.1109\/CVPR.2019.00235"},{"key":"11450_CR17","doi-asserted-by":"crossref","unstructured":"Chollet F (2016) Xception: deep learning with depthwise separable convolutions. In: 2017 IEEE conference on computer vision and pattern recognition (CVPR), pp 1800\u20131807","DOI":"10.1109\/CVPR.2017.195"},{"key":"11450_CR18","doi-asserted-by":"crossref","unstructured":"Chowdhury P, Srivastava G (2024) Enhanced classification and segmentation of brain tumors in MRI images using custom CNN and U-Net models with XAI. In: International conference on pattern recognition. Springer, pp 1\u201316","DOI":"10.1007\/978-3-031-78104-9_1"},{"key":"11450_CR19","doi-asserted-by":"crossref","unstructured":"\u00c7i\u00e7ek \u00d6, Abdulkadir A, Lienkamp SS, Brox T, Ronneberger O (2016) 3D U-Net: learning dense volumetric segmentation from sparse annotation. In: International conference on medical image computing and computer-assisted intervention. Springer, pp 424\u2013432","DOI":"10.1007\/978-3-319-46723-8_49"},{"key":"11450_CR20","doi-asserted-by":"crossref","unstructured":"Corona E, Zanfir A, Bazavan EG, Kolotouros N, Alldieck T, Sminchisescu C (2024) Vlogger: multimodal diffusion for embodied avatar synthesis. In: 2025 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 15896\u201315908","DOI":"10.1109\/CVPR52734.2025.01482"},{"key":"11450_CR21","unstructured":"DeepSeek-AI (2024) DeepSeek LLM: scaling open-source language models with longtermism. arXiv:2401.02954"},{"key":"11450_CR22","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2019) Bert: pre-training of deep bidirectional transformers for language understanding. In: North American Chapter of the Association for Computational Linguistics"},{"key":"11450_CR23","unstructured":"Donahue D, Lialin V, Rumshisky A (2019) Injecting hierarchy with u-net transformers. arXiv preprint arXiv:1910.10488"},{"key":"11450_CR24","doi-asserted-by":"crossref","unstructured":"Esser P, Sutter E, Ommer B (2018) A variational U-Net for conditional appearance and shape generation. In: 2018 IEEE\/CVF conference on computer vision and pattern recognition, pp 8857\u20138866","DOI":"10.1109\/CVPR.2018.00923"},{"key":"11450_CR25","unstructured":"Evans Z, Carr C, Taylor J, Hawley SH, Pons J (2024) Fast timing-conditioned latent audio diffusion. arXiv:2402.04825"},{"key":"11450_CR26","doi-asserted-by":"publisher","first-page":"906290","DOI":"10.3389\/fnins.2022.906290","volume":"16","author":"FV Farahani","year":"2022","unstructured":"Farahani FV, Fiok K, Lahijanian B, Karwowski W, Douglas PK (2022) Explainable AI: a review of applications to neuroimaging data. Front Neurosci 16:906290","journal-title":"Front Neurosci"},{"key":"11450_CR27","doi-asserted-by":"crossref","unstructured":"Giri R, Isik U, Krishnaswamy A (2019) Attention wave-U-Net for speech enhancement. In: 2019 IEEE workshop on applications of signal processing to audio and acoustics (WASPAA). IEEE, pp 249\u2013253","DOI":"10.1109\/WASPAA.2019.8937186"},{"issue":"8","key":"11450_CR28","doi-asserted-by":"publisher","first-page":"801","DOI":"10.3844\/jcssp.2024.801.818","volume":"20","author":"R Gozalo-Brizuela","year":"2024","unstructured":"Gozalo-Brizuela R, Garrido Merchan E (2024) A survey of generative AI applications. J Comput Sci 20(8):801\u2013818. https:\/\/doi.org\/10.3844\/jcssp.2024.801.818","journal-title":"J Comput Sci"},{"key":"11450_CR29","unstructured":"Gu A, Dao T (2023) Mamba: linear-time sequence modeling with selective state spaces. arXiv:2312.00752"},{"key":"11450_CR30","doi-asserted-by":"publisher","first-page":"144456","DOI":"10.1109\/ACCESS.2023.3344813","volume":"11","author":"S Gul","year":"2023","unstructured":"Gul S, Khan MS (2023) A survey of audio enhancement algorithms for music, speech, bioacoustics, biomedical, industrial, and environmental sounds by image u-net. IEEE Access 11:144456\u2013144483","journal-title":"IEEE Access"},{"key":"11450_CR31","unstructured":"Ha JS (2024) Video3D: single image to 3D using video diffusion and volume renderer. https:\/\/cs.brown.edu\/media\/filer_public\/aa\/4a\/aa4a6cdf-6edf-4f1f-b628-7496e576504f\/junsukha.pdf"},{"key":"11450_CR32","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TPAMI.2020.3033882","volume":"45","author":"K Han","year":"2020","unstructured":"Han K, Wang Y, Chen H, Chen X, Guo J, Liu Z, Tang Y, Xiao A, Xu C, Xu Y, Yang Z, Zhang Y, Tao D (2020) A survey on vision transformer. IEEE Trans Pattern Anal Machine Intell 45:1","journal-title":"IEEE Trans Pattern Anal Machine Intell"},{"key":"11450_CR33","doi-asserted-by":"crossref","unstructured":"Hao X, Su X, Wang Z, Zhang H, Batushiren (2020) UnetGAN: a robust speech enhancement approach in time domain for extremely low signal-to-noise ratio condition. arXiv preprint arXiv:2010.15521","DOI":"10.21437\/Interspeech.2019-1567"},{"key":"11450_CR34","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"11450_CR35","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J (2013) Framing image description as a ranking task: data, models and evaluation metrics. J Artif Intell Res 47:853\u2013899","journal-title":"J Artif Intell Res"},{"issue":"11","key":"11450_CR36","doi-asserted-by":"publisher","first-page":"1423","DOI":"10.3390\/sym16111423","volume":"16","author":"Y Hou","year":"2024","unstructured":"Hou Y, Ma X, Zhang J, Guo C (2024) Symmetric connected U-Net with multi-head self attention (MHSA) and WGAN for image inpainting. Symmetry 16(11):1423","journal-title":"Symmetry"},{"key":"11450_CR37","unstructured":"Howard AG, Zhu M, Chen B, Kalenichenko D, Wang W, Weyand T, Andreetto M, Adam H (2017) Mobilenets: efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861"},{"key":"11450_CR38","unstructured":"Huang Y, Xu J, Jiang Z, Lai J, Li Z, Yao Y, Chen T, Yang L, Xin Z, Ma X (2023) Advancing transformer architecture in long-context large language models: a comprehensive survey. arXiv:2311.12351"},{"key":"11450_CR39","unstructured":"Iandola FN, Moskewicz MW, Ashraf K, Han S, Dally WJ, Keutzer K (2016) Squeezenet: alexnet-level accuracy with 50x fewer parameters and $$<$$1mb model size. arXiv:1602.07360"},{"key":"11450_CR40","doi-asserted-by":"publisher","first-page":"74","DOI":"10.1016\/j.neunet.2019.08.025","volume":"121","author":"N Ibtehaz","year":"2020","unstructured":"Ibtehaz N, Rahman MS (2020) MultiresUNet: rethinking the U-Net architecture for multimodal biomedical image segmentation. Neural Netw 121:74\u201387","journal-title":"Neural Netw"},{"key":"11450_CR41","doi-asserted-by":"publisher","first-page":"100553","DOI":"10.1016\/j.cosrev.2023.100553","volume":"48","author":"G Iglesias","year":"2022","unstructured":"Iglesias G, Talavera E, D\u00edaz-\u00c1lvarez A (2022) A survey on GANs for computer vision: recent research, analysis and taxonomy. Comput Sci Rev 48:100553 arXiv:abs\/2203.11242","journal-title":"Comput Sci Rev"},{"key":"11450_CR42","doi-asserted-by":"crossref","unstructured":"Ignacio MJ, Kim Y-G, Jin H, Yu S (2025) U-Net encapsulated transformer for reducing dimensionality in training large language models. https:\/\/github.com\/ignaciomarvinjohn\/uetlm. Accessed 2025","DOI":"10.1145\/3735653"},{"key":"11450_CR43","unstructured":"Ignacio MJ, Nguyen TT, Jin H, Kim Y-G (2024) Meme analysis using LLM-based contextual information and u-net encapsulated transformer. IEEE Access"},{"key":"11450_CR44","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: accelerating deep network training by reducing internal covariate shift. In: International conference on machine learning. PMLR, pp 448\u2013456"},{"key":"11450_CR45","doi-asserted-by":"crossref","unstructured":"Isola P, Zhu J-Y, Zhou T, Efros AA (2017) Image-to-image translation with conditional adversarial networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1125\u20131134","DOI":"10.1109\/CVPR.2017.632"},{"key":"11450_CR46","unstructured":"Jansson A, Humphrey EJ, Montecchio N, Bittner R, Kumar A, Weyde T (2017) Singing voice separation with deep u-net convolutional networks. In: 18th international society for music information retrieval conference"},{"key":"11450_CR47","unstructured":"Jiang Y, Tang H, Chang J-HR, Song L, Wang Z, Cao L (2023) Efficient-3dim: learning a generalizable single-image novel-view synthesizer in one day. arXiv:2310.03015"},{"key":"11450_CR48","doi-asserted-by":"crossref","unstructured":"Karras JS, Holynski A, Wang T-C, Kemelmacher-Shlizerman I (2023) Dreampose: fashion image-to-video synthesis via stable diffusion. In: 2023 IEEE\/CVF international conference on computer vision (ICCV), pp 22623\u201322633","DOI":"10.1109\/ICCV51070.2023.02073"},{"key":"11450_CR49","doi-asserted-by":"crossref","unstructured":"Kaur A, Dong G, Basu A (2022) Gradxcepunet: explainable ai based medical image segmentation. In: International conference on smart multimedia. Springer, pp 174\u2013188","DOI":"10.1007\/978-3-031-22061-6_13"},{"key":"11450_CR50","doi-asserted-by":"crossref","unstructured":"Kerdvibulvech C (2025) Multimodal AI model for zero-shot vehicle brand identification. Multimedia Tools Appl 1\u201320","DOI":"10.1007\/s11042-024-20559-3"},{"issue":"10","key":"11450_CR51","doi-asserted-by":"publisher","first-page":"285","DOI":"10.1007\/s10462-024-10922-z","volume":"57","author":"AG Khoee","year":"2024","unstructured":"Khoee AG, Yu Y, Feldt R (2024) Domain generalization through meta-learning: a survey. Artif Intell Rev 57(10):285","journal-title":"Artif Intell Rev"},{"key":"11450_CR52","doi-asserted-by":"publisher","first-page":"179891","DOI":"10.1109\/ACCESS.2024.3502628","volume":"12","author":"T-S Kim","year":"2024","unstructured":"Kim T-S, Ignacio MJ, Yu S, Jin H, Kim Y-G (2024) UI\/UX for generative AI: taxonomy, trend, and challenge. IEEE Access 12:179891\u2013179911","journal-title":"IEEE Access"},{"key":"11450_CR53","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1016\/j.aiopen.2022.10.001","volume":"3","author":"T Lin","year":"2021","unstructured":"Lin T, Wang Y, Liu X, Qiu X (2021) A survey of transformers. AI Open 3:111\u2013132","journal-title":"AI Open"},{"key":"11450_CR54","unstructured":"Liu H, Chen Z, Yuan Y, Mei X, Liu X, Mandic DP, Wang W, Plumbley MD (2023) Audioldm: text-to-audio generation with latent diffusion models. In: International conference on machine learning"},{"key":"11450_CR55","doi-asserted-by":"crossref","unstructured":"Li D, Yang Y, Song Y-Z, Hospedales T (2018) Learning to generalize: meta-learning for domain generalization. In: Proceedings of the AAAI conference on artificial intelligence, vol 32","DOI":"10.1609\/aaai.v32i1.11596"},{"key":"11450_CR56","unstructured":"Macartney C, Weyde T (2018) Improved speech enhancement with the wave-U-Net. arXiv preprint arXiv:1811.11307"},{"key":"11450_CR57","doi-asserted-by":"crossref","unstructured":"Ma L, Jia X, Sun Q, Schiele B, Tuytelaars T, Gool LV (2017) Pose guided person image generation. arXiv:1705.09368","DOI":"10.1109\/CVPR.2018.00018"},{"key":"11450_CR58","doi-asserted-by":"crossref","unstructured":"Majumder N, Hung C-Y, Ghosal D, Hsu W-N, Mihalcea R, Poria S (2024) Tango 2: aligning diffusion-based text-to-audio generations through direct preference optimization. In: ACM multimedia","DOI":"10.1145\/3664647.3681688"},{"key":"11450_CR59","unstructured":"Ma J, Li F, Wang B (2024) U-mamba: enhancing long-range dependency for biomedical image segmentation. arXiv:2401.04722"},{"key":"11450_CR60","first-page":"1","volume":"10","author":"IS Na","year":"2020","unstructured":"Na IS, Tran CD, Nguyen D, Dinh SV (2020) Facial UV map completion for pose-invariant face recognition: a novel adversarial approach based on coupled attention residual UNets. HCIS 10:1\u201317","journal-title":"HCIS"},{"key":"11450_CR61","unstructured":"Nagel M, Fournarakis M, Amjad RA, Bondarenko Y, Baalen M, Blankevoort T (2021) A white paper on neural network quantization. arXiv:2106.08295"},{"key":"11450_CR62","unstructured":"Nair V, Hinton GE (2010) Rectified linear units improve restricted Boltzmann machines. In: Proceedings of the 27th international conference on machine learning (ICML-10), pp 807\u2013814"},{"key":"11450_CR63","unstructured":"Nguyen HTT, Cao HQ, Nguyen KVT, Pham NDK (2021) Evaluation of explainable artificial intelligence: shap, lime, and cam. In: Proceedings of the FPT AI conference, pp 1\u20136"},{"key":"11450_CR64","doi-asserted-by":"publisher","first-page":"10142","DOI":"10.1109\/TITS.2021.3122865","volume":"23","author":"D Omeiza","year":"2021","unstructured":"Omeiza D, Webb H, Jirotka M, Kunze L (2021) Explanations in autonomous driving: a survey. IEEE Trans Intell Transp Syst 23:10142\u201310162","journal-title":"IEEE Trans Intell Transp Syst"},{"key":"11450_CR65","unstructured":"Oord A, Kalchbrenner N, Espeholt L, Vinyals O, Graves A, Kavukcuoglu K (2016) Conditional image generation with pixelcnn decoders. Adv Neural Inf Process Syst 29"},{"key":"11450_CR66","unstructured":"OpenAI (2023) ChatGPT (Mar 14 version). Large language model. https:\/\/chat.openai.com\/chat"},{"key":"11450_CR67","unstructured":"Poole B, Jain A, Barron JT, Mildenhall B (2022) Dreamfusion: text-to-3D using 2D diffusion. arXiv:2209.14988"},{"key":"11450_CR68","doi-asserted-by":"publisher","first-page":"5845","DOI":"10.1007\/s10462-022-10152-1","volume":"55","author":"NS Punn","year":"2021","unstructured":"Punn NS, Agarwal S (2021) Modality specific U-Net variants for biomedical image segmentation: a survey. Artif Intell Rev 55:5845\u20135889","journal-title":"Artif Intell Rev"},{"key":"11450_CR69","doi-asserted-by":"crossref","unstructured":"Qin HX, Hui P (2023) Empowering the metaverse with generative AI: survey and future directions. In: 2023 IEEE 43rd international conference on distributed computing systems workshops (ICDCSW), pp 85\u201390","DOI":"10.1109\/ICDCSW60045.2023.00022"},{"key":"11450_CR70","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J et al (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning. PMLR, pp 8748\u20138763"},{"key":"11450_CR71","unstructured":"Radford A, Narasimhan K, Salimans T, Sutskever I (2018) Improving language understanding by generative pre-training. https:\/\/openai.com\/blog\/language-unsupervised\/"},{"key":"11450_CR72","doi-asserted-by":"crossref","unstructured":"Rahmon G, Bunyak F, Seetharaman G, Palaniappan K (2021) Motion U-Net: multi-cue encoder-decoder network for motion segmentation. In: 2020 25th international conference on pattern recognition (ICPR), pp 8125\u20138132","DOI":"10.1109\/ICPR48806.2021.9413211"},{"key":"11450_CR73","unstructured":"Ramachandran P, Zoph B, Le QV (2018) Searching for activation functions. arXiv:1710.05941"},{"key":"11450_CR74","unstructured":"Raut G, Singh A (2024) Generative ai in vision: a survey on models, metrics and applications. arXiv:2402.16369"},{"key":"11450_CR75","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2022.103861","volume":"79","author":"R Raza","year":"2022","unstructured":"Raza R, Bajwa UI, Mehmood Y, Anwar MW, Jamal MH (2022) dResU-Net: 3D deep residual u-net based brain tumor segmentation from multimodal MRI. Biomed Signal Process Control 79:103861","journal-title":"Biomed Signal Process Control"},{"key":"11450_CR76","doi-asserted-by":"crossref","unstructured":"Redko I, Morvant E, Habrard A, Sebban M, Bennani Y (2020) A survey on domain adaptation theory: learning bounds and theoretical guarantees. arXiv preprint arXiv:2004.11829","DOI":"10.1016\/B978-1-78548-236-6.50002-7"},{"key":"11450_CR77","doi-asserted-by":"crossref","unstructured":"Rombach R, Blattmann A, Lorenz D, Esser P, Ommer B (2022) High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10684\u201310695","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"11450_CR78","doi-asserted-by":"crossref","unstructured":"Ronneberger O, Fischer P, Brox T (2015) U-Net: convolutional networks for biomedical image segmentation. In: Medical image computing and computer-assisted intervention\u2014MICCAI 2015: 18th international conference, Munich, Germany, October 5\u20139, 2015, Proceedings, Part III 18, Springer, pp 234\u2013241","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"11450_CR79","doi-asserted-by":"crossref","unstructured":"Ruan J, Xiang S (2024) VM-UNet: vision mamba UNet for medical image segmentation. arXiv:2402.02491","DOI":"10.1145\/3767748"},{"key":"11450_CR80","doi-asserted-by":"crossref","unstructured":"Ruiz N, Li Y, Jampani V, Pritch Y, Rubinstein M, Aberman K (2022) Dreambooth: fine tuning text-to-image diffusion models for subject-driven generation. In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 22500\u201322510","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"11450_CR81","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia C, Chan W, Saxena S, Li L, Whang J, Denton EL, Ghasemipour K, Gontijo Lopes R, Karagol Ayan B, Salimans T et al (2022) Photorealistic text-to-image diffusion models with deep language understanding. Adv Neural Inf Process Syst 35:36479\u201336494","journal-title":"Adv Neural Inf Process Syst"},{"key":"11450_CR82","doi-asserted-by":"publisher","first-page":"219","DOI":"10.1080\/01691864.2022.2029720","volume":"36","author":"T Sakai","year":"2021","unstructured":"Sakai T, Nagai T (2021) Explainable autonomous robots: a survey and perspective. Adv Robot 36:219\u2013238","journal-title":"Adv Robot"},{"key":"11450_CR83","unstructured":"Salimans T, Karpathy A, Chen X, Kingma DP (2017) Pixelcnn++: improving the pixelcnn with discretized logistic mixture likelihood and other modifications. arXiv preprint arXiv:1701.05517"},{"issue":"9","key":"11450_CR84","doi-asserted-by":"publisher","first-page":"2132","DOI":"10.3390\/diagnostics12092132","volume":"12","author":"N Sharma","year":"2022","unstructured":"Sharma N, Saba L, Khanna NN, Kalra MK, Fouda MM, Suri JS (2022) Segmentation-based classification deep learning model embedded with explainable AI for COVID-19 detection in chest X-ray scans. Diagnostics 12(9):2132","journal-title":"Diagnostics"},{"issue":"23","key":"11450_CR85","doi-asserted-by":"publisher","first-page":"5496","DOI":"10.3390\/cancers15235496","volume":"15","author":"Y Shi","year":"2023","unstructured":"Shi Y, Tang H, Baine M, Hollingsworth MA, Du H, Zheng D, Zhang C, Yu H (2023) 3DGAUnet: 3D generative adversarial networks with a 3D U-Net based generator to achieve the accurate and effective synthesis of clinical tumor image data for pancreatic cancer. Cancers 15(23):5496","journal-title":"Cancers"},{"key":"11450_CR86","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2019.113064","volume":"143","author":"S-Y Shin","year":"2020","unstructured":"Shin S-Y, Kang Y-W, Kim Y-G (2020) Reward-driven U-Net training for obstacle avoidance drone. Expert Syst Appl 143:113064. https:\/\/doi.org\/10.1016\/j.eswa.2019.113064","journal-title":"Expert Syst Appl"},{"key":"11450_CR87","doi-asserted-by":"crossref","unstructured":"Si C, Huang Z, Jiang Y, Liu Z (2023) Freeu: free lunch in diffusion u-net. In: 2024 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 4733\u20134743","DOI":"10.1109\/CVPR52733.2024.00453"},{"key":"11450_CR88","doi-asserted-by":"crossref","unstructured":"Song CH, Han HJ, Avrithis Y (2021) All the attention you need: global-local, spatial-channel attention for image retrieval. In: 2022 IEEE\/CVF winter conference on applications of computer vision (WACV), pp 439\u2013448","DOI":"10.1109\/WACV51458.2022.00051"},{"key":"11450_CR89","unstructured":"Sun F, Li L, Qiu X, Liu Y (2018) U-Net: machine reading comprehension with unanswerable questions. arXiv preprint arXiv:1810.06638"},{"key":"11450_CR90","unstructured":"tamasino52 (2021) UNETR-Pose. GitHub"},{"key":"11450_CR91","doi-asserted-by":"crossref","unstructured":"Tang D, Chen Z (2024) English long sentence machine translation algorithm based on u-net generation adversarial network. In: 2024 second international conference on data science and information system (ICDSIS), pp 1\u20135","DOI":"10.1109\/ICDSIS61070.2024.10594266"},{"key":"11450_CR92","unstructured":"Tan M, Le QV (2019) EfficientNet: rethinking model scaling for convolutional neural networks. arXiv:1905.11946"},{"key":"11450_CR93","unstructured":"Team G (2024) Gemini: a family of highly capable multimodal models. arXiv:2312.11805"},{"key":"11450_CR94","unstructured":"Ulyanov D, Vedaldi A, Lempitsky V (2016) Instance normalization: the missing ingredient for fast stylization. arXiv preprint arXiv:1607.08022"},{"key":"11450_CR95","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102075","volume":"103","author":"S Umirzakova","year":"2024","unstructured":"Umirzakova S, Ahmad S, Khan LU, Whangbo T (2024) Medical image super-resolution for smart healthcare applications: a comprehensive survey. Inf Fusion 103:102075","journal-title":"Inf Fusion"},{"key":"11450_CR96","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst 30"},{"key":"11450_CR97","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2022.102470","volume":"79","author":"BHM Velden","year":"2021","unstructured":"Velden BHM, Kuijf HJ, Gilhuijs KGA, Viergever MA (2021) Explainable artificial intelligence (XAI) in deep learning-based medical image analysis. Med Image Anal 79:102470","journal-title":"Med Image Anal"},{"key":"11450_CR98","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.124781","volume":"255","author":"J Wang","year":"2024","unstructured":"Wang J, Ignacio MJ, Yu S, Jin H, Kim Y-G (2024) UET4Rec: U-Net encapsulated transformer for sequential recommender. Expert Syst Appl 255:124781","journal-title":"Expert Syst Appl"},{"key":"11450_CR99","doi-asserted-by":"crossref","unstructured":"Wang Z, Zheng J-Q, Zhang Y, Cui G, Li L (2024) Mamba-UNet: UNet-like pure visual mamba for medical image segmentation. arXiv:2402.05079","DOI":"10.2139\/ssrn.5097998"},{"key":"11450_CR100","doi-asserted-by":"crossref","unstructured":"Wu D-Y, Chen Y-H, Lee H-Y (2020) Vqvc+: one-shot voice conversion by vector quantization and u-net architecture. arXiv preprint arXiv:2006.04154","DOI":"10.21437\/Interspeech.2020-1443"},{"key":"11450_CR101","doi-asserted-by":"crossref","unstructured":"Wu Y, He K (2018) Group normalization. In: Proceedings of the European conference on computer vision (ECCV), pp 3\u201319","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"11450_CR102","doi-asserted-by":"crossref","unstructured":"Wu J, Liu W, Li C, Jiang T, Shariful IM, Sun H, Li X, Li X, Huang X, Grzegorzek M (2022) A state-of-the-art survey of u-net in microscopic image analysis: from simple usage to structure mortification. arXiv:2202.06465","DOI":"10.1007\/s00521-023-09284-4"},{"key":"11450_CR103","unstructured":"Wu L, Xia Y, Zhao L, Tian F, Qin T, Lai J, Liu T-Y (2017) Adversarial neural machine translation. arXiv:abs\/1704.06933"},{"key":"11450_CR104","doi-asserted-by":"crossref","unstructured":"Xie S, Zhang Z, Lin Z, Hinz T, Zhang K (2022) Smartbrush: text and shape guided object inpainting with diffusion model. In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 22428\u201322437","DOI":"10.1109\/CVPR52729.2023.02148"},{"key":"11450_CR105","doi-asserted-by":"crossref","unstructured":"Xing Z, Ye T, Yang Y, Liu G, Zhu L (2024) Segmamba: long-range sequential modeling mamba for 3D medical image segmentation. In: International conference on medical image computing and computer-assisted intervention","DOI":"10.1007\/978-3-031-72111-3_54"},{"key":"11450_CR106","unstructured":"Xu B, Wang N, Chen T, Li M (2015) Empirical evaluation of rectified activations in convolutional network. arXiv preprint arXiv:1505.00853"},{"key":"11450_CR107","unstructured":"Yang Z, Chen W, Wang F, Xu B (2017) Improving neural machine translation with conditional sequence generative adversarial nets. arXiv:1703.04887"},{"key":"11450_CR108","doi-asserted-by":"crossref","unstructured":"Yan Z, Li X, Li M, Zuo W, Shan S (2018) Shift-net: image inpainting via deep feature rearrangement. In: European conference on computer vision","DOI":"10.1007\/978-3-030-01264-9_1"},{"issue":"3","key":"11450_CR109","doi-asserted-by":"publisher","first-page":"856","DOI":"10.3390\/s25030856","volume":"25","author":"DJ Yeong","year":"2025","unstructured":"Yeong DJ, Panduru K, Walsh J (2025) Exploring the unseen: a survey of multi-sensor fusion and the role of explainable AI (XAI) in autonomous vehicles. Sensors 25(3):856","journal-title":"Sensors"},{"key":"11450_CR110","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107580","volume":"110","author":"Y Yin","year":"2021","unstructured":"Yin Y, Xu D, Wang X, Zhang L (2021) AGUNet: annotation-guided U-Net for fast one-shot video object segmentation. Pattern Recognit 110:107580","journal-title":"Pattern Recognit"},{"key":"11450_CR111","doi-asserted-by":"crossref","unstructured":"Zeng F, Gan W, Wang Y, Yu PS (2023) Distributed training of large language models. In: 2023 IEEE 29th international conference on parallel and distributed systems (ICPADS), pp 840\u2013847","DOI":"10.1109\/ICPADS60453.2023.00126"},{"issue":"5","key":"11450_CR112","doi-asserted-by":"publisher","first-page":"749","DOI":"10.1109\/LGRS.2018.2802944","volume":"15","author":"Z Zhang","year":"2018","unstructured":"Zhang Z, Liu Q, Wang Y (2018) Road extraction by deep residual u-net. IEEE Geosci Remote Sens Lett 15(5):749\u2013753","journal-title":"IEEE Geosci Remote Sens Lett"},{"key":"11450_CR113","doi-asserted-by":"publisher","first-page":"2226","DOI":"10.1109\/TCSVT.2021.3089724","volume":"32","author":"L Zhang","year":"2022","unstructured":"Zhang L, Yang H, Qiu T, Li L (2022) AP-GAN: improving attribute preservation in video face swapping. IEEE Trans Circuits Syst Video Technol 32:2226\u20132237","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"11450_CR114","doi-asserted-by":"crossref","unstructured":"Zhang L, Rao A, Agrawala M (2023) Adding conditional control to text-to-image diffusion models. In: 2023 IEEE\/CVF international conference on computer vision (ICCV), pp 3813\u20133824","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"11450_CR115","doi-asserted-by":"crossref","unstructured":"Zhang X, Zhou X, Lin M, Sun J (2017) Shufflenet: an extremely efficient convolutional neural network for mobile devices. In: 2018 IEEE\/CVF conference on computer vision and pattern recognition, pp 6848\u20136856","DOI":"10.1109\/CVPR.2018.00716"},{"key":"11450_CR116","doi-asserted-by":"crossref","unstructured":"Zhou P, Wang L, Liu Z, Hao Y, Hui P, Tarkoma S, Kangasharju J (2024) A survey on generative ai and LLM for video generation, understanding, and streaming. arXiv:2404.16038","DOI":"10.36227\/techrxiv.171172801.19993069\/v1"},{"key":"11450_CR117","unstructured":"Zhou D, Wang W, Yan H, Lv W, Zhu Y, Feng J (2022) Magicvideo: efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018"},{"key":"11450_CR118","doi-asserted-by":"crossref","unstructured":"Zhou S, Yang P, Wang J, Luo Y, Loy CC (2024) Upscale-a-video: temporal-consistent diffusion model for real-world video super-resolution. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 2535\u20132545","DOI":"10.1109\/CVPR52733.2024.00245"}],"container-title":["Artificial Intelligence Review"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-025-11450-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10462-025-11450-0","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-025-11450-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T05:45:41Z","timestamp":1771479941000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10462-025-11450-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,24]]},"references-count":118,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2026,2]]}},"alternative-id":["11450"],"URL":"https:\/\/doi.org\/10.1007\/s10462-025-11450-0","relation":{},"ISSN":["1573-7462"],"issn-type":[{"value":"1573-7462","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,24]]},"assertion":[{"value":"1 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"45"}}