{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T15:44:25Z","timestamp":1780501465893,"version":"3.54.1"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"23","license":[{"start":{"date-parts":[[2021,6,22]],"date-time":"2021-06-22T00:00:00Z","timestamp":1624320000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,6,22]],"date-time":"2021-06-22T00:00:00Z","timestamp":1624320000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/100016714","name":"University of Sharjah","doi-asserted-by":"crossref","award":["Machine Learning and Arabic Language Processing"],"award-info":[{"award-number":["Machine Learning and Arabic Language Processing"]}],"id":[{"id":"10.13039\/100016714","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2021,12]]},"DOI":"10.1007\/s00521-021-06226-w","type":"journal-article","created":{"date-parts":[[2021,6,22]],"date-time":"2021-06-22T16:02:50Z","timestamp":1624377770000},"page":"16033-16055","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":29,"title":["Novel hybrid DNN approaches for speaker verification in emotional and stressful talking environments"],"prefix":"10.1007","volume":"33","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7856-9342","authenticated-orcid":false,"given":"Ismail","family":"Shahin","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ali Bou","family":"Nassif","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nawel","family":"Nemmour","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ashraf","family":"Elnagar","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Adi","family":"Alhudhaif","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kemal","family":"Polat","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2021,6,22]]},"reference":[{"key":"6226_CR1","doi-asserted-by":"crossref","unstructured":"Reynolds DA (2002) An overview of automatic speaker recognition technology. In: IEEE international conference on acoustics speech and signal processing, Orlando, FL, USA. IEEE, pp IV-4072-IV\u20134075","DOI":"10.1109\/ICASSP.2002.5745552"},{"key":"6226_CR2","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-018-3760-2","author":"I Shahin","year":"2020","unstructured":"Shahin I, Nassif AB, Hamsa S (2020) Novel cascaded Gaussian mixture model-deep neural network classifier for speaker identification in emotional talking environments. Neural Comput Appl https:\/\/doi.org\/10.1007\/s00521-018-3760-2","journal-title":"Neural Comput Appl"},{"key":"6226_CR3","doi-asserted-by":"publisher","first-page":"20","DOI":"10.18178\/ijmlc.2019.9.1.760","volume":"9","author":"A Irum","year":"2019","unstructured":"Irum A, Salman A (2019) Speaker verification using deep neural networks: a review. Int J Mach Learn Comput 9:20\u201325. https:\/\/doi.org\/10.18178\/ijmlc.2019.9.1.760","journal-title":"Int J Mach Learn Comput"},{"key":"6226_CR4","doi-asserted-by":"crossref","unstructured":"Shahin I, Nassif AB (2019) Speaker verification in emotional talking environments based on third-order circular suprasegmental hidden markov model. In 2019 International conference on electrical and computing technologies and applications (ICECTA), Ras Al Khaimah, UAE, pp 1\u20136","DOI":"10.1109\/ICECTA48151.2019.8959553"},{"key":"6226_CR5","doi-asserted-by":"crossref","unstructured":"Furui S (2009) Speaker recognition in smart environments. In Human-centric interfaces for ambient intelligence, Academic Press, pp 163\u2013184. ISBN 978-0-12-374708-2","DOI":"10.1016\/B978-0-12-374708-2.00007-3"},{"key":"6226_CR6","doi-asserted-by":"crossref","unstructured":"Wei Wu, Thomas Fang Zheng, Ming-Xing Xu and H-JB (2006) Study on Speaker Verification on Emotional Speech. In: INTERSPEECH 2006 - ICSLP, Ninth international conference on spoken language processing, Pittsburgh, PA, USA, pp 17\u201321, 2006","DOI":"10.21437\/Interspeech.2006-191"},{"key":"6226_CR7","unstructured":"Shahin I (2016) Emirati speaker verification based on HMMls, HMM2s, and HMM3s. In 2016 IEEE 13th international conference on signal processing, Chengdu, China, pp 562\u2013567"},{"key":"6226_CR8","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1049\/iet-spr.2008.0175","volume":"3","author":"SG Pillay","year":"2009","unstructured":"Pillay SG, Ariyaeeinia A, Pawlewski M, Sivakumaran P (2009) Speaker verification under mismatched data conditions. Signal Process IET 3:236\u2013246. https:\/\/doi.org\/10.1049\/iet-spr.2008.0175","journal-title":"Signal Process IET"},{"key":"6226_CR9","doi-asserted-by":"crossref","unstructured":"Shahin I (2009) Verifying speakers in emotional environments. In: 2009 IEEE international symposium on signal processing and information technology (ISSPIT), Ajman, United Arab Emirates, pp 328\u2013333","DOI":"10.1109\/ISSPIT.2009.5407568"},{"key":"6226_CR10","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1515\/jisys-2014-0118","volume":"25","author":"I Shahin","year":"2016","unstructured":"Shahin I (2016) Employing emotion cues to verify speakers in emotional talking environments. J Intell Syst 25:3\u201317. https:\/\/doi.org\/10.1515\/jisys-2014-0118","journal-title":"J Intell Syst"},{"key":"6226_CR11","doi-asserted-by":"publisher","first-page":"915","DOI":"10.1007\/s10772-018-9543-4","volume":"21","author":"I Shahin","year":"2018","unstructured":"Shahin I, Nassif AB (2018) Three-stage speaker verification architecture in emotional talking environments. Int J Speech Technol 21:915\u2013930. https:\/\/doi.org\/10.1007\/s10772-018-9543-4","journal-title":"Int J Speech Technol"},{"key":"6226_CR12","doi-asserted-by":"publisher","first-page":"19143","DOI":"10.1109\/ACCESS.2019.2896880","volume":"7","author":"AB Nassif","year":"2019","unstructured":"Nassif AB, Shahin I, Attili I et al (2019) Speech recognition using deep neural networks: a systematic review. IEEE Access 7:19143\u201319165. https:\/\/doi.org\/10.1109\/ACCESS.2019.2896880","journal-title":"IEEE Access"},{"key":"6226_CR13","doi-asserted-by":"publisher","first-page":"107141","DOI":"10.1016\/j.asoc.2021.107141","volume":"103","author":"AB Nassif","year":"2021","unstructured":"Nassif AB, Shahin I, Hamsa S et al (2021) CASA-based speaker identification using cascaded GMM-CNN classifier in noisy and emotional talking conditions. Appl Soft Comput 103:107141. https:\/\/doi.org\/10.1016\/j.asoc.2021.107141","journal-title":"Appl Soft Comput"},{"key":"6226_CR14","doi-asserted-by":"publisher","unstructured":"Variani E, Lei X, McDermott E et al (2014) Deep neural networks for small footprint text-dependent speaker verification. In IEEE International Conference on Acoust Speech Signal Process. https:\/\/doi.org\/10.1109\/ICASSP.2014.6854363","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"6226_CR15","doi-asserted-by":"publisher","first-page":"3573","DOI":"10.21437\/Interspeech.2018-1158","volume":"2018","author":"Y Zhu","year":"2018","unstructured":"Zhu Y, Ko T, Snyder D et al (2018) Self-attentive speaker embeddings for text-independent speaker verification. Proc Interspeech 2018:3573\u20133577","journal-title":"Proc Interspeech"},{"key":"6226_CR16","doi-asserted-by":"crossref","unstructured":"Torfi A, Dawson J, Nasrabadi NM (2018) Text-independent speaker verification using 3D convolutional neural networks. In: 2018 IEEE international conference on multimedia and expo (ICME), San Diego, CA, USA, IEEE, pp 1\u20136","DOI":"10.1109\/ICME.2018.8486441"},{"key":"6226_CR17","doi-asserted-by":"publisher","first-page":"1420","DOI":"10.3390\/electronics9091420","volume":"9","author":"BH Prasetio","year":"2020","unstructured":"Prasetio BH, Tamura H, Tanno K (2020) Emotional variability analysis based i-vector for speaker verification in under-stress conditions. Electronics 9:1420. https:\/\/doi.org\/10.3390\/electronics9091420","journal-title":"Electronics"},{"key":"6226_CR18","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1007\/s10772-019-09665-y","volume":"23","author":"S Hourri","year":"2020","unstructured":"Hourri S, Kharroubi J (2020) A deep learning approach for speaker recognition. Int J Speech Technol 23:123\u2013131. https:\/\/doi.org\/10.1007\/s10772-019-09665-y","journal-title":"Int J Speech Technol"},{"key":"6226_CR19","doi-asserted-by":"publisher","DOI":"10.1007\/s10772-021-09795-2","author":"S Hourri","year":"2021","unstructured":"Hourri S, Nikolov NS, Kharroubi J (2021) Convolutional neural network vectors for speaker recognition. Int J Speech Technol. https:\/\/doi.org\/10.1007\/s10772-021-09795-2","journal-title":"Int J Speech Technol"},{"key":"6226_CR20","doi-asserted-by":"publisher","first-page":"1243","DOI":"10.1109\/TASLP.2021.3065202","volume":"29","author":"X Chen","year":"2021","unstructured":"Chen X, Bao C (2021) Phoneme-unit-specific time-delay neural network for speaker verification. IEEE\/ACM Trans Audio Speech Lang Process 29:1243\u20131255. https:\/\/doi.org\/10.1109\/TASLP.2021.3065202","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"6226_CR21","doi-asserted-by":"publisher","first-page":"3548","DOI":"10.1007\/s00034-019-01103-3","volume":"38","author":"MA Laskar","year":"2019","unstructured":"Laskar MA, Laskar RH (2019) Integrating DNN\u2013HMM technique with hierarchical multi-layer acoustic model for text-dependent speaker verification. Circuits, Syst Signal Process 38:3548\u20133572. https:\/\/doi.org\/10.1007\/s00034-019-01103-3","journal-title":"Circuits, Syst Signal Process"},{"key":"6226_CR22","doi-asserted-by":"crossref","unstructured":"Bykov MM, Kovtun V, Kobylyanska I, W\u00f3jcik W, Smailova S (2019) Improvement of the learning process of the automated speaker recognition system for critical use with HMM-DNN component. In: Symposium on photonics applications in astronomy, communications, industry, and high-energy physics experiments, Wilga, Poland","DOI":"10.1117\/12.2536888"},{"key":"6226_CR23","doi-asserted-by":"crossref","unstructured":"Hansen JHL, Bou-Ghazale SE (1997) Getting started with susas: a speech under simulated and actual stress database. In: EUROSPEECH, Rhodes, Greece","DOI":"10.21437\/Eurospeech.1997-494"},{"key":"6226_CR24","doi-asserted-by":"publisher","first-page":"200","DOI":"10.1109\/TPAMI.2003.1177152","volume":"25","author":"ME Munich","year":"2003","unstructured":"Munich ME, Perona P (2003) Visual identification by signature tracking. IEEE Trans Pattern Anal Mach Intell 25:200\u2013217. https:\/\/doi.org\/10.1109\/TPAMI.2003.1177152","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"5","key":"6226_CR25","doi-asserted-by":"publisher","first-page":"e0196391","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone SR, Russo FA (2018) The ryerson audio-visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in North American English. PLoS ONE 13(5):e0196391","journal-title":"PLoS ONE"},{"key":"6226_CR26","doi-asserted-by":"publisher","first-page":"201","DOI":"10.1109\/89.905995","volume":"9","author":"G Zhou","year":"2001","unstructured":"Zhou G, Hansen JHL, Kaiser JF (2001) Nonlinear feature based classification of speech under stress. IEEE Trans Speech Audio Process 9:201\u2013216. https:\/\/doi.org\/10.1109\/89.905995","journal-title":"IEEE Trans Speech Audio Process"},{"key":"6226_CR27","doi-asserted-by":"publisher","first-page":"1206","DOI":"10.1016\/j.specom.2009.06.005","volume":"51","author":"V Pitsikalis","year":"2009","unstructured":"Pitsikalis V, Maragos P (2009) Analysis and classification of speech signals by generalized fractal dimension features. Speech Commun 51:1206\u20131223. https:\/\/doi.org\/10.1016\/j.specom.2009.06.005","journal-title":"Speech Commun"},{"key":"6226_CR28","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1109\/TSA.2004.838534","volume":"13","author":"CM Lee","year":"2005","unstructured":"Lee CM, Narayanan SS (2005) Toward detecting emotions in spoken dialogs. IEEE Trans Speech Audio Process 13:293\u2013303. https:\/\/doi.org\/10.1109\/TSA.2004.838534","journal-title":"IEEE Trans Speech Audio Process"},{"key":"6226_CR29","doi-asserted-by":"publisher","first-page":"133","DOI":"10.1007\/s10772-012-9170-4","volume":"16","author":"IMA Shahin","year":"2013","unstructured":"Shahin IMA (2013) Gender-dependent emotion recognition based on HMMs and SPHMMs. Int J Speech Technol 16:133\u2013141. https:\/\/doi.org\/10.1007\/s10772-012-9170-4","journal-title":"Int J Speech Technol"},{"key":"6226_CR30","doi-asserted-by":"publisher","first-page":"341","DOI":"10.1007\/s10772-013-9188-2","volume":"16","author":"I Shahin","year":"2013","unstructured":"Shahin I (2013) Employing both gender and emotion cues to enhance speaker identification performance in emotional talking environments. Int J Speech Technol 16:341\u2013351. https:\/\/doi.org\/10.1007\/s10772-013-9188-2","journal-title":"Int J Speech Technol"},{"key":"6226_CR31","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1007\/s10772-011-9089-1","volume":"14","author":"I Shahin","year":"2011","unstructured":"Shahin I (2011) Identifying speakers using their emotion cues. Int J Speech Technol 14:89\u201398. https:\/\/doi.org\/10.1007\/s10772-011-9089-1","journal-title":"Int J Speech Technol"},{"key":"6226_CR32","doi-asserted-by":"publisher","first-page":"316","DOI":"10.1016\/j.engappai.2014.07.006","volume":"35","author":"I Shahin","year":"2014","unstructured":"Shahin I (2014) Novel third-order hidden Markov models for speaker identification in shouted talking environments. Eng Appl Artif Intell 35:316\u2013323. https:\/\/doi.org\/10.1016\/j.engappai.2014.07.006","journal-title":"Eng Appl Artif Intell"},{"key":"6226_CR33","unstructured":"Li L, Wang D, Zhang Z, Zheng T (2015) Deep speaker vectors for semi text-independent speaker verification. pp 1\u20135. arXiv:1505.06427"},{"key":"6226_CR34","volume-title":"Speech communications: human and machine","author":"D O\u2019Shaughnessy","year":"1987","unstructured":"O\u2019Shaughnessy D (1987) Speech communications: human and machine. Addison-Wesley, Boston"},{"key":"6226_CR35","first-page":"3","volume":"34","author":"S Furui","year":"1986","unstructured":"Furui S (1986) Speaker-independent isolated word recognition using dynamic features of speech spectrum. IEEE Trans Acoust Speech Signal Process ASP 34:3\u20139","journal-title":"IEEE Trans Acoust Speech Signal Process ASP"},{"key":"6226_CR36","doi-asserted-by":"publisher","first-page":"3028","DOI":"10.1109\/IJCNN.2017.7966232","volume":"2017","author":"X Bao","year":"2017","unstructured":"Bao X, Gao T, Du J, Dai L (2017) An investigation of high-resolution modeling units of deep neural networks for acoustic scene classification. Int Jt Conf Neural Netw 2017:3028\u20133035. https:\/\/doi.org\/10.1109\/IJCNN.2017.7966232","journal-title":"Int Jt Conf Neural Netw"},{"key":"6226_CR37","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1007\/978-3-319-71928-3_19","volume-title":"Mining intelligence and knowledge exploration","author":"VVR Vegesna","year":"2017","unstructured":"Vegesna VVR, Gurugubelli K, Vydana HK et al (2017) DNN-HMM acoustic modeling for large vocabulary telugu speech recognition. In: Ghosh A, Pal R, Prasath R (eds) Mining intelligence and knowledge exploration. Springer, Cham, pp 189\u2013197"},{"key":"6226_CR38","unstructured":"Stuart A, Ord K (1994) Kendall\u2019s advanced theory of statistics, Volume 1: distribution theory. 6th Edn, Edward Arnold, London"},{"key":"6226_CR39","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1016\/j.bjoms.2007.09.002","volume":"46","author":"E McCrum-Gardner","year":"2008","unstructured":"McCrum-Gardner E (2008) Which is the correct statistical test to use? Br J Oral Maxillofac Surg 46:38\u201341. https:\/\/doi.org\/10.1016\/j.bjoms.2007.09.002","journal-title":"Br J Oral Maxillofac Surg"},{"key":"6226_CR40","first-page":"938","volume":"8","author":"K Kuppusamy","year":"2019","unstructured":"Kuppusamy K, Eswaran C (2019) Speech and speaker recognition: a review. Int J Sci Technol Res 8:938\u2013944","journal-title":"Int J Sci Technol Res"},{"key":"6226_CR41","doi-asserted-by":"publisher","unstructured":"Lalitha S, Madhavan A, Bhushan B, Saketh S (2015) Speech emotion recognition. In: 2014 international conference on advanced electronics computing and communication ICAECC Bangalore, India, pp 235\u2013238. https:\/\/doi.org\/10.1109\/ICAECC.2014.7002390","DOI":"10.1109\/ICAECC.2014.7002390"},{"key":"6226_CR42","doi-asserted-by":"crossref","unstructured":"Sarkar AK, Tan Z-H (2016) Text dependent speaker verification using un-supervised HMM-UBM and temporal GMM-UBM. In: Interspeech, San Francisco, CA, USA, pp 425\u2013429","DOI":"10.21437\/Interspeech.2016-362"},{"issue":"2","key":"6226_CR43","doi-asserted-by":"publisher","first-page":"884","DOI":"10.26438\/ijcse\/v7i2.884887","volume":"7","author":"RP Dharmistha","year":"2019","unstructured":"RP Dharmistha (2019) A survey on speaker recognition with various feature extraction techniques. Int J Comput Sci Eng, 7(2): 884\u2013887. https:\/\/doi.org\/10.26438\/ijcse\/v7i2.884887","journal-title":"Int J Comput Sci Eng"},{"key":"6226_CR44","first-page":"1","volume":"3","author":"BC Kamble","year":"2016","unstructured":"Kamble BC (2016) Speech recognition using artificial neural network\u2013a review. Int J Comput Commun Instrum Eng 3:1\u20134","journal-title":"Int J Comput Commun Instrum Eng"},{"key":"6226_CR45","doi-asserted-by":"publisher","first-page":"223","DOI":"10.1007\/s10772-013-9221-5","volume":"17","author":"E Zarrouk","year":"2014","unstructured":"Zarrouk E, Ben Ayed Y, Gargouri F (2014) Hybrid continuous speech recognition systems by HMM, MLP and SVM: a comparative study. Int J Speech Technol 17:223\u2013233. https:\/\/doi.org\/10.1007\/s10772-013-9221-5","journal-title":"Int J Speech Technol"},{"key":"6226_CR46","doi-asserted-by":"publisher","unstructured":"Jamal N, Shanta S, Mahmud F, Sha\u2019abani M (2017) Automatic speech recognition (ASR) based approach for speech therapy of aphasic patients: a review. In: Proceedings of the international conference on electrical and electronic engineering (IC3E 2017), AIP conference proceedings 1883, p 020028. https:\/\/doi.org\/10.1063\/1.5002046","DOI":"10.1063\/1.5002046"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-021-06226-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-021-06226-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-021-06226-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T05:25:00Z","timestamp":1672550700000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-021-06226-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,22]]},"references-count":46,"journal-issue":{"issue":"23","published-print":{"date-parts":[[2021,12]]}},"alternative-id":["6226"],"URL":"https:\/\/doi.org\/10.1007\/s00521-021-06226-w","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,6,22]]},"assertion":[{"value":"8 April 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 June 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 June 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interest"}},{"value":"This study does not involve any experiments on animals.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Informed consent"}},{"value":"The authors have authorization from the \u201cUniversity of Sharjah to gather speech database from UAE nationals based on the competitive research project entitled Emirati-Accented Speaker and Emotion Recognition Based on Deep Neural Network, No. 19020403139.\u201d","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}