{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T18:29:12Z","timestamp":1772821752503,"version":"3.50.1"},"reference-count":54,"publisher":"Springer Science and Business Media LLC","issue":"35","license":[{"start":{"date-parts":[[2024,9,23]],"date-time":"2024-09-23T00:00:00Z","timestamp":1727049600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,23]],"date-time":"2024-09-23T00:00:00Z","timestamp":1727049600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100016714","name":"University of Sharjah","doi-asserted-by":"crossref","award":["NA"],"award-info":[{"award-number":["NA"]}],"id":[{"id":"10.13039\/100016714","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1007\/s00521-024-10389-7","type":"journal-article","created":{"date-parts":[[2024,9,23]],"date-time":"2024-09-23T09:01:53Z","timestamp":1727082113000},"page":"22569-22586","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["ViT-LSTM synergy: a multi-feature approach for speaker identification and mask detection"],"prefix":"10.1007","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1570-0897","authenticated-orcid":false,"given":"Ali Bou","family":"Nassif","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ismail","family":"Shahin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mohamed","family":"Bader","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abdelfatah","family":"Ahmed","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Naoufel","family":"Werghi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,9,23]]},"reference":[{"key":"10389_CR1","doi-asserted-by":"publisher","first-page":"315","DOI":"10.1016\/j.asoc.2017.10.036","volume":"62","author":"S Shahmoradi","year":"2018","unstructured":"Shahmoradi S, Shouraki SB (2018) Evaluation of a novel fuzzy sequential pattern recognition tool (fuzzy elastic matching machine) and its applications in speech and handwriting recognition. Appl Soft Comput 62:315\u2013327","journal-title":"Appl Soft Comput"},{"key":"10389_CR2","doi-asserted-by":"publisher","first-page":"217","DOI":"10.1016\/j.asoc.2017.03.013","volume":"56","author":"C Yogesh","year":"2017","unstructured":"Yogesh C, Hariharan M, Ngadiran R, Adom AH, Yaacob S, Polat K (2017) Hybrid bbo_pso and higher order spectral features for emotion and stress recognition from natural speech. Appl Soft Comput 56:217\u2013232","journal-title":"Appl Soft Comput"},{"key":"10389_CR3","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.119871","volume":"224","author":"S Hamsa","year":"2023","unstructured":"Hamsa S, Shahin I, Iraqi Y, Damiani E, Nassif AB, Werghi N (2023) Speaker identification from emotional and noisy speech using learned voice segregation and speech VGG. Expert Syst Appl 224:119871","journal-title":"Expert Syst Appl"},{"key":"10389_CR4","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2021.116469","volume":"193","author":"AB Nassif","year":"2022","unstructured":"Nassif AB, Shahin I, Elnagar A, Velayudhan D, Alhudhaif A, Polat K (2022) Emotional speaker identification using a novel capsule nets model. Expert Syst Appl 193:116469","journal-title":"Expert Syst Appl"},{"key":"10389_CR5","doi-asserted-by":"crossref","unstructured":"Bader M, Shahin I, Ahmed A, Werghi N (2022) Hybrid CNN-LSTM speaker identification framework for evaluating the impact of face masks. In: 2022 international conference on electrical and computing technologies and applications (ICECTA), IEEE, pp 118\u2013121","DOI":"10.1109\/ICECTA57148.2022.9990138"},{"key":"10389_CR6","doi-asserted-by":"crossref","unstructured":"Bader M, Shahin I, Ahmed A, Werghi N (2022) Studying the effect of face masks in identifying speakers using lstm. In: 2022 international conference on electrical and computing technologies and applications (ICECTA), IEEE, pp 99\u2013102","DOI":"10.1109\/ICECTA57148.2022.9990479"},{"key":"10389_CR7","unstructured":"Subakan C, Ravanelli M, Cornell S, Grondin F, Bronzi M (2022) On using transformers for speech-separation. arXiv:2202.02884"},{"key":"10389_CR8","doi-asserted-by":"publisher","first-page":"3770","DOI":"10.1007\/s00034-015-0220-4","volume":"35","author":"IM Shahin","year":"2016","unstructured":"Shahin IM (2016) Speaker identification in a shouted talking environment based on novel third-order circular suprasegmental hidden Markov models. Circ Syst Signal Process 35:3770\u20133792","journal-title":"Circ Syst Signal Process"},{"key":"10389_CR9","doi-asserted-by":"crossref","unstructured":"Wittum KJ, Feth L, Hoglund E (2013) The effects of surgical masks on speech perception in noise. In: Proceedings of meetings on acoustics, vol. 19, AIP Publishing","DOI":"10.1121\/1.4800719"},{"issue":"3","key":"10389_CR10","doi-asserted-by":"publisher","first-page":"0283724","DOI":"10.1371\/journal.pone.0283724","volume":"18","author":"P Geng","year":"2023","unstructured":"Geng P, Lu Q, Guo H, Zeng J (2023) The effects of face mask on speech production and its implication for forensic speaker identification-a cross-linguistic study. PloS One 18(3):0283724","journal-title":"PloS One"},{"key":"10389_CR11","doi-asserted-by":"crossref","unstructured":"Yang Z, An Z, Fan Z, Jing C, Cao H (2020) Exploration of acoustic and lexical cues for the interspeech 2020 computational paralinguistic challenge. INTERSPEECH 2020","DOI":"10.21437\/Interspeech.2020-2999"},{"key":"10389_CR12","first-page":"4085","volume":"75","author":"AA Khan","year":"2023","unstructured":"Khan AA, Jahangir R, Alroobaea R, Alyahyan SY, Almulhi AH, Alsafyani M, Wechtaisong C (2023) An efficient text-independent speaker identification using feature fusion and transformer model. Comput Mater Contin 75:4085\u20134100","journal-title":"Comput Mater Contin"},{"issue":"2","key":"10389_CR13","doi-asserted-by":"publisher","first-page":"385","DOI":"10.3758\/BRM.41.2.385","volume":"41","author":"NH De Jong","year":"2009","unstructured":"De Jong NH, Wempe T (2009) Praat script to detect syllable nuclei and measure speech rate automatically. Behav Res Methods 41(2):385\u2013390","journal-title":"Behav Res Methods"},{"issue":"1","key":"10389_CR14","first-page":"186","volume":"15","author":"YA Ibrahim","year":"2017","unstructured":"Ibrahim YA, Odiketa JC, Ibiyemi TS (2017) Preprocessing technique in automatic speech recognition for human computer interaction: an overview. Ann Comput Sci Ser 15(1):186\u2013191","journal-title":"Ann Comput Sci Ser"},{"issue":"10","key":"10389_CR15","doi-asserted-by":"publisher","first-page":"78","DOI":"10.1145\/2347736.2347755","volume":"55","author":"P Domingos","year":"2012","unstructured":"Domingos P (2012) A few useful things to know about machine learning. Commun ACM 55(10):78\u201387","journal-title":"Commun ACM"},{"key":"10389_CR16","doi-asserted-by":"crossref","unstructured":"Tzirakis P, Zhang J, Schuller B (2018) End-to-end speech emotion recognition using a deep convolutional recurrent network. In: Proceedings of the 2018 IEEE international conference on acoustics, speech and signal processing, ICASSP, Calgary, AB, Canada, pp 15\u201320","DOI":"10.1109\/ICASSP.2018.8462677"},{"issue":"12","key":"10389_CR17","doi-asserted-by":"publisher","first-page":"18006","DOI":"10.15680\/IJIRSET.2014.0312034","volume":"3","author":"PK Kurzekar","year":"2014","unstructured":"Kurzekar PK, Deshmukh RR, Waghmare VB, Shrishrimal PP (2014) A comparative study of feature extraction techniques for speech recognition system. Int J Innov Res Sci Eng Technol 3(12):18006\u201318016","journal-title":"Int J Innov Res Sci Eng Technol"},{"key":"10389_CR18","doi-asserted-by":"publisher","first-page":"316","DOI":"10.1016\/j.engappai.2014.07.006","volume":"35","author":"I Shahin","year":"2014","unstructured":"Shahin I (2014) Novel third-order hidden Markov models for speaker identification in shouted talking environments. Eng Appl Artif Intell 35:316\u2013323","journal-title":"Eng Appl Artif Intell"},{"issue":"1","key":"10389_CR19","doi-asserted-by":"publisher","first-page":"443","DOI":"10.1121\/1.2205131","volume":"120","author":"K Ishizuka","year":"2006","unstructured":"Ishizuka K, Nakatani T, Minami Y, Miyazaki N (2006) Speech feature extraction method using subband-based periodicity and nonperiodicity decomposition. J Acous Soc America 120(1):443\u2013452","journal-title":"J Acous Soc America"},{"key":"10389_CR20","unstructured":"Muda L, Begam M, Elamvazuthi I (2010) Voice recognition algorithms using mel frequency cepstral coefficient (mfcc) and dynamic time warping (dtw) techniques. arXiv:1003.4083"},{"issue":"1","key":"10389_CR21","doi-asserted-by":"publisher","first-page":"12","DOI":"10.1016\/j.specom.2009.08.009","volume":"52","author":"T Kinnunen","year":"2010","unstructured":"Kinnunen T, Li H (2010) An overview of text-independent speaker recognition: from features to supervectors. Speech Commun 52(1):12\u201340","journal-title":"Speech Commun"},{"key":"10389_CR22","unstructured":"Abdalla MI, Ali HS (2010) Wavelet-based mel-frequency cepstral coefficients for speaker identification using hidden markov models. arXiv:1003.5627"},{"issue":"7C2","key":"10389_CR23","first-page":"197","volume":"8","author":"R Ranjan","year":"2019","unstructured":"Ranjan R, Thakur A (2019) Analysis of feature extraction techniques for speech recognition system. Int J Innov Technol Explor Eng 8(7C2):197\u2013200","journal-title":"Int J Innov Technol Explor Eng"},{"key":"10389_CR24","doi-asserted-by":"crossref","unstructured":"Bachu RG, Kopparthi S, Adapa B, Barkana BD (2010) Voiced\/unvoiced decision for speech signals based on zero-crossing rate and energy. In: Advanced techniques in computing sciences and software engineering, Springer, pp 279\u2013282","DOI":"10.1007\/978-90-481-3660-5_47"},{"issue":"2","key":"10389_CR25","doi-asserted-by":"publisher","first-page":"659","DOI":"10.1016\/j.dsp.2012.10.008","volume":"23","author":"M Kos","year":"2013","unstructured":"Kos M, Ka\u010di\u010d Z, Vlaj D (2013) Acoustic classification and segmentation using modified spectral roll-off and variance-based features. Digit Signal Process 23(2):659\u2013674","journal-title":"Digit Signal Process"},{"key":"10389_CR26","doi-asserted-by":"crossref","unstructured":"Staudinger T, Polikar R (2011) Analysis of complexity based eeg features for the diagnosis of alzheimer\u2019s disease. In: 2011 annual international conference of the ieee engineering in medicine and biology society, IEEE, pp 2033\u20132036","DOI":"10.1109\/IEMBS.2011.6090374"},{"key":"10389_CR27","unstructured":"Thornton B (2019) Audio recognition using mel spectrograms and convolution neural networks"},{"key":"10389_CR28","doi-asserted-by":"publisher","first-page":"221640","DOI":"10.1109\/ACCESS.2020.3043201","volume":"8","author":"MB Er","year":"2020","unstructured":"Er MB (2020) A novel approach for classification of speech emotions based on deep and acoustic features. IEEE Access 8:221640\u2013221653","journal-title":"IEEE Access"},{"key":"10389_CR29","unstructured":"Shah A, Kattel M, Nepal A, Shrestha D (2019) Chroma feature extraction: chroma feature extraction using fourier transform"},{"key":"10389_CR30","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2019.107020","volume":"158","author":"G Sharma","year":"2020","unstructured":"Sharma G, Umapathy K, Krishnan S (2020) Trends in audio signal feature extraction methods. Appl Acoust 158:107020","journal-title":"Appl Acoust"},{"issue":"4","key":"10389_CR31","doi-asserted-by":"publisher","first-page":"839","DOI":"10.3390\/electronics12040839","volume":"12","author":"K Bhangale","year":"2023","unstructured":"Bhangale K, Kothandaraman M (2023) Speech emotion recognition based on multiple acoustic features and deep convolutional neural network. Electronics 12(4):839","journal-title":"Electronics"},{"key":"10389_CR32","doi-asserted-by":"crossref","unstructured":"Vivek V, Vidhya S, Madhanmohan P (2020) Acoustic scene classification in hearing aid using deep learning. In: 2020 International Conference on Communication and Signal Processing (ICCSP), pp. 0695\u20130699. IEEE","DOI":"10.1109\/ICCSP48568.2020.9182160"},{"key":"10389_CR33","doi-asserted-by":"crossref","unstructured":"Veltman A, Pulle DW, De\u00a0Doncker RW, Veltman A, Pulle DW, De\u00a0Doncker RW (2016) The transformer.  In: Fundamentals of electrical drives, pp 47\u201382","DOI":"10.1007\/978-3-319-29409-4_3"},{"key":"10389_CR34","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, et al (2020) An image is worth 16x16 words: transformers for image recognition at scale. arXiv:2010.11929"},{"key":"10389_CR35","unstructured":"Gong C, Wang D, Li M, Chandra V, Liu Q (2021) Vision transformers with patch diversification. arXiv:2104.12753"},{"issue":"6","key":"10389_CR36","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3530811","volume":"55","author":"Y Tay","year":"2022","unstructured":"Tay Y, Dehghani M, Bahri D, Metzler D (2022) Efficient transformers: a survey. ACM Comput Surv 55(6):1\u201328. https:\/\/doi.org\/10.1145\/3530811","journal-title":"ACM Comput Surv"},{"key":"10389_CR37","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. arXiv:1607.06450"},{"key":"10389_CR38","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Proc Adv Neural Inf Process Syst, vol 30"},{"key":"10389_CR39","doi-asserted-by":"publisher","unstructured":"Dong L, Xu S, Xu B (2018) Speech-transformer: A no-recurrence sequence-to-sequence model for speech recognition. In: 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 5884\u20135888. https:\/\/doi.org\/10.1109\/ICASSP.2018.8462506","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"10389_CR40","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2018) Bert: pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805"},{"issue":"1","key":"10389_CR41","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel C, Shazeer N, Roberts A, Lee K, Narang S, Matena M, Zhou Y, Li W, Liu PJ (2020) Exploring the limits of transfer learning with a unified text-to-text transformer. J Mach Learn Res 21(1):5485\u20135551","journal-title":"J Mach Learn Res"},{"key":"10389_CR42","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown T, Mann B, Ryder N, Subbiah M, Kaplan JD, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A et al (2020) Language models are few-shot learners. Adv Neural Inf Process Syst 33:1877\u20131901","journal-title":"Adv Neural Inf Process Syst"},{"issue":"1","key":"10389_CR43","doi-asserted-by":"publisher","first-page":"11554","DOI":"10.1038\/s41598-022-15163-0","volume":"12","author":"Y Borhani","year":"2022","unstructured":"Borhani Y, Khoramdel J, Najafi E (2022) A deep learning based approach for automated plant disease classification using vision transformer. Sci Rep 12(1):11554","journal-title":"Sci Rep"},{"key":"10389_CR44","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.109552","volume":"253","author":"S Zuo","year":"2022","unstructured":"Zuo S, Xiao Y, Chang X, Wang X (2022) Vision transformers for dense prediction: a survey. Knowl Based Syst 253:109552","journal-title":"Knowl Based Syst"},{"key":"10389_CR45","unstructured":"Touvron H, Cord M, Douze M, Massa F, Sablayrolles A, J\u00e9gou H (2021) Training data-efficient image transformers & distillation through attention. In: International conference on machine learning, PMLR, pp 10347\u201310357"},{"key":"10389_CR46","unstructured":"Ulhaq A, Akhtar N, Pogrebna G, Mian A (2022) Vision transformers for action recognition: a survey. arXiv:2209.05700"},{"issue":"5","key":"10389_CR47","doi-asserted-by":"publisher","first-page":"0196391","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone SR, Russo FA (2018) The Ryerson audio-visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in north american english. PloS One 13(5):0196391","journal-title":"PloS One"},{"key":"10389_CR48","doi-asserted-by":"publisher","unstructured":"Abunasser BS, AL-Hiealy MRJ, Barhoom AM, Almasri AR, Abu-Naser SS (2022) Prediction of instructor performance using machine and deep learning techniques. Int J Adv Comput Sci Appl 13(7). https:\/\/doi.org\/10.14569\/IJACSA.2022.0130711","DOI":"10.14569\/IJACSA.2022.0130711"},{"issue":"1","key":"10389_CR49","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-022-00561-y","volume":"9","author":"A Chiche","year":"2022","unstructured":"Chiche A, Yitagesu B (2022) Part of speech tagging: a systematic review of deep learning and machine learning approaches. J Big Data 9(1):1\u201325","journal-title":"J Big Data"},{"key":"10389_CR50","series-title":"Lecture notes in networks and systems","doi-asserted-by":"publisher","first-page":"771","DOI":"10.1007\/978-3-031-37963-5_53","volume-title":"Intelligent computing. SAI 2023","author":"R Cahuantzi","year":"2023","unstructured":"Cahuantzi R, Chen X, G\u00fcttel S (2023) A comparison of LSTM and GRU networks for learning symbolic sequences. In: Arai K (ed) Intelligent computing. SAI 2023. Lecture notes in networks and systems, vol 739. Springer, Cham, pp 771\u2013785. https:\/\/doi.org\/10.1007\/978-3-031-37963-5_53"},{"issue":"5","key":"10389_CR51","first-page":"350","volume":"16","author":"HW Al-Dulaimi","year":"2023","unstructured":"Al-Dulaimi HW, Aldhahab A, Al Abboodi HM (2023) Speaker identification system employing multi-resolution analysis in conjunction with CNN. Int J Intell Eng Syst 16(5):350\u2013361","journal-title":"Int J Intell Eng Syst"},{"key":"10389_CR52","doi-asserted-by":"crossref","unstructured":"Sefara TJ, Mokgonyane TB (2020) Emotional speaker recognition based on machine and deep learning. In: 2020 2nd international multidisciplinary information technology and engineering conference (IMITEC), IEEE, pp 1\u20138","DOI":"10.1109\/IMITEC50163.2020.9334138"},{"key":"10389_CR53","doi-asserted-by":"crossref","unstructured":"Al\u00a0Hindawi NA, Shahin I, Nassif AB (2021) Speaker identification for disguised voices based on modified SVM classifier. In: 2021 18th international multi-conference on systems, signals & devices (SSD), IEEE, pp 687\u2013691","DOI":"10.1109\/SSD52085.2021.9429403"},{"issue":"10","key":"10389_CR54","doi-asserted-by":"publisher","first-page":"2636","DOI":"10.1080\/03610918.2014.931971","volume":"44","author":"DG Pereira","year":"2015","unstructured":"Pereira DG, Afonso A, Medeiros FM (2015) Overview of Friedman\u2019s test and post-hoc analysis. Commun Stat Simul Comput 44(10):2636\u20132653","journal-title":"Commun Stat Simul Comput"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-10389-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-024-10389-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-10389-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T12:08:24Z","timestamp":1732536504000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-024-10389-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,23]]},"references-count":54,"journal-issue":{"issue":"35","published-print":{"date-parts":[[2024,12]]}},"alternative-id":["10389"],"URL":"https:\/\/doi.org\/10.1007\/s00521-024-10389-7","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,23]]},"assertion":[{"value":"17 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 August 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 September 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"This study does not involve any experiments on animals.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Informed consent"}}]}}