{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T16:48:58Z","timestamp":1775666938942,"version":"3.50.1"},"reference-count":233,"publisher":"Springer Science and Business Media LLC","issue":"16","license":[{"start":{"date-parts":[[2021,1,2]],"date-time":"2021-01-02T00:00:00Z","timestamp":1609545600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,2]],"date-time":"2021-01-02T00:00:00Z","timestamp":1609545600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2021,7]]},"DOI":"10.1007\/s11042-020-09874-7","type":"journal-article","created":{"date-parts":[[2021,1,2]],"date-time":"2021-01-02T10:03:09Z","timestamp":1609581789000},"page":"23745-23812","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":86,"title":["Deep learning approaches for speech emotion recognition: state of the art and research challenges"],"prefix":"10.1007","volume":"80","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1129-6006","authenticated-orcid":false,"given":"Rashid","family":"Jahangir","sequence":"first","affiliation":[]},{"given":"Ying Wah","family":"Teh","sequence":"additional","affiliation":[]},{"given":"Faiqa","family":"Hanif","sequence":"additional","affiliation":[]},{"given":"Ghulam","family":"Mujtaba","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,1,2]]},"reference":[{"key":"9874_CR1","doi-asserted-by":"crossref","unstructured":"Abadi M, Chu A, Goodfellow I, McMahan HB, Mironov I, Talwar K, Zhang L (2016b). Deep learning with differential privacy. In: Proceedings of the 2016 ACM SIGSAC conference on computer and communications security. ACM, pp 308\u2013318","DOI":"10.1145\/2976749.2978318"},{"key":"9874_CR2","unstructured":"Abadi M et al. (2016a). Tensorflow: large-scale machine learning on heterogeneous distributed systems arXiv preprint arXiv:160304467"},{"key":"9874_CR3","doi-asserted-by":"crossref","unstructured":"Adam T, Salam M, Gunawan TS (2013). Wavelet based Cepstral Coefficients for neural network speech recognition. In: 2013 IEEE International Conference on Signal and Image Processing Applications. IEEE, pp 447\u2013451","DOI":"10.1109\/ICSIPA.2013.6708048"},{"key":"9874_CR4","doi-asserted-by":"crossref","first-page":"39","DOI":"10.5121\/acij.2016.7205","volume":"7","author":"R Alghamdi","year":"2016","unstructured":"Alghamdi R (2016) Hidden Markov Models (HMMs) and Security Applications. Int J Adv Comput Sci Appl 7:39\u201347","journal-title":"Int J Adv Comput Sci Appl"},{"key":"9874_CR5","doi-asserted-by":"crossref","unstructured":"Anoop V, Rao P, Aruna S (2018). An effective speech emotion recognition using artificial neural networks. In: International proceedings on advances in soft computing, Intelligent Systems and Applications. Springer, pp. 393\u2013401","DOI":"10.1007\/978-981-10-5272-9_36"},{"key":"9874_CR6","first-page":"14","volume":"20","author":"M A-r","year":"2011","unstructured":"A-r M, Dahl GE, Hinton G (2011) Acoustic modeling using deep belief networks. IEEE Trans Audio Speech Lang Process 20:14\u201322","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"9874_CR7","doi-asserted-by":"crossref","unstructured":"Arshad H, Khan MA, Sharif M, Yasmin M, Javed MYJIJoML, cybernetics (2019). Multi-level features fusion and selection for human gait recognition: an optimized framework of Bayesian model and binomial distribution 10:3601\u20133618","DOI":"10.1007\/s13042-019-00947-0"},{"key":"9874_CR8","doi-asserted-by":"crossref","unstructured":"Arshad H, Khan MA, Sharif MI, Yasmin M, Tavares JMR, Zhang YD, Satapathy SCJES (2020). A multilevel paradigm for deep convolutional neural network features selection with an application to human gait recognition:e12541","DOI":"10.1111\/exsy.12541"},{"key":"9874_CR9","unstructured":"Automation C (2010) CASIA Chinese emotional Corpus. Institute of Automation, Chinese Academy of Sciences. http:\/\/www.chineseldc.org\/doc\/CLDC-SPC-2005-010\/report.htm. 2010"},{"key":"9874_CR10","doi-asserted-by":"crossref","unstructured":"Aytar Y, Vondrick C, Torralba A (2016). Soundnet: learning sound representations from unlabeled video. In: Advances in neural information processing systems. pp. 892\u2013900","DOI":"10.1109\/CVPR.2016.18"},{"key":"9874_CR11","doi-asserted-by":"crossref","unstructured":"Badshah AM et al. (2019). Deep features-based speech emotion recognition for smart affective services multimedia tools and applications 78:5571-5589","DOI":"10.1007\/s11042-017-5292-7"},{"key":"9874_CR12","doi-asserted-by":"crossref","first-page":"614","DOI":"10.1037\/0022-3514.70.3.614","volume":"70","author":"R Banse","year":"1996","unstructured":"Banse R, Scherer KR (1996) Acoustic profiles in vocal emotion expression. J Pers Soc Psychol 70:614\u2013636","journal-title":"J Pers Soc Psychol"},{"key":"9874_CR13","doi-asserted-by":"crossref","unstructured":"Bargal SA, Barsoum E, Ferrer CC, Zhang C (2016). Emotion recognition in the wild from videos using images. In: Proceedings of the 18th ACM International Conference on Multimodal Interaction. ACM, pp 433\u2013436","DOI":"10.1145\/2993148.2997627"},{"key":"9874_CR14","first-page":"1","volume":"2","author":"Y Bengio","year":"2009","unstructured":"Bengio Y (2009) Learning deep architectures for AI foundations and trends\u00ae in. Mach Learn 2:1\u2013127","journal-title":"Mach Learn"},{"key":"9874_CR15","first-page":"1","volume":"2","author":"U Bhattacharjee","year":"2013","unstructured":"Bhattacharjee U (2013) A comparative study of LPCC and MFCC features for the recognition of Assamese phonemes. International journal of engineering research and technology 2:1\u20136","journal-title":"International journal of engineering research and technology"},{"key":"9874_CR16","doi-asserted-by":"crossref","first-page":"55","DOI":"10.1109\/TIP.2012.2210727","volume":"22","author":"A Borji","year":"2012","unstructured":"Borji A, Sihite DN, Itti L (2012) Quantitative analysis of human-model agreement in visual saliency modeling: A comparative study. IEEE Trans Image Process 22:55\u201369","journal-title":"IEEE Trans Image Process"},{"key":"9874_CR17","unstructured":"Brownlee J (2019). Deep Learning & Artificial Neural Networks. Machine learning mastery. https:\/\/machinelearningmastery.com\/what-is-deep-learning\/. 2019"},{"key":"9874_CR18","doi-asserted-by":"crossref","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso C et al (2008) IEMOCAP: Interactive emotional dyadic motion capture database. Lang Resour Eval 42:335","journal-title":"Lang Resour Eval"},{"key":"9874_CR19","doi-asserted-by":"crossref","unstructured":"Cairong Z, Xinran Z, Cheng Z, Li Z (2016). A novel DBN feature fusion model for cross-corpus speech emotion recognition Journal of Electrical and Computer Engineering 2016","DOI":"10.1155\/2016\/7437860"},{"key":"9874_CR20","unstructured":"Campbell N (2000). Databases of emotional speech. In: ISCA Tutorial and Research Workshop (ITRW) on Speech and Emotion"},{"key":"9874_CR21","doi-asserted-by":"crossref","unstructured":"Chen L, Su W, Feng Y, Wu M, She J, Hirota KJIS (2020). Two-layer fuzzy multiple random forest for speech emotion recognition in human-robot interaction 509:150\u2013163","DOI":"10.1016\/j.ins.2019.09.005"},{"key":"9874_CR22","doi-asserted-by":"crossref","unstructured":"Chen R, Zhou Y, Qian Y (2018). Emotion Recognition Using Support Vector Machine and Deep Neural Network. In, Singapore. Man-machine speech communication. Springer Singapore, pp 122\u2013131","DOI":"10.1007\/978-981-10-8111-8_12"},{"key":"9874_CR23","unstructured":"Chernykh V, Prikhodko P (2017). Emotion recognition from speech with recurrent neural networks arXiv preprint arXiv:170108071"},{"key":"9874_CR24","unstructured":"Chung J, Gulcehre C, Cho K, Bengio Y (2014). Empirical evaluation of gated recurrent neural networks on sequence modeling arXiv preprint arXiv:14123555"},{"key":"9874_CR25","doi-asserted-by":"crossref","unstructured":"Coetzee H, Barnwell T An LSP (1989). Based speech quality measure. In: International Conference on Acoustics, Speech, and Signal Processing. IEEE, pp 596\u2013599","DOI":"10.1109\/ICASSP.1989.266497"},{"key":"9874_CR26","unstructured":"Costantini G, Iaderola I, Paoloni A, Todisco M (2014). Emovo corpus: an italian emotional speech database. In: International Conference on Language Resources and Evaluation (LREC 2014). European Language Resources Association (ELRA), pp 3501\u20133504"},{"key":"9874_CR27","doi-asserted-by":"crossref","first-page":"5","DOI":"10.1016\/S0167-6393(02)00071-7","volume":"40","author":"R Cowie","year":"2003","unstructured":"Cowie R, Cornelius RR (2003) Describing the emotional states that are expressed in speech. Speech Comm 40:5\u201332","journal-title":"Speech Comm"},{"key":"9874_CR28","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1109\/79.911197","volume":"18","author":"R Cowie","year":"2001","unstructured":"Cowie R, Douglas-Cowie E, Tsapatsoulis N, Votsis G, Kollias S, Fellenz W, Taylor JG (2001) Emotion recognition in human-computer interaction. IEEE Signal Process Mag 18:32\u201380","journal-title":"IEEE Signal Process Mag"},{"key":"9874_CR29","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1049\/iet-spr.2012.0151","volume":"7","author":"M Cutajar","year":"2013","unstructured":"Cutajar M, Gatt E, Grech I, Casha O, Micallef J (2013) Comparative study of automatic speech recognition techniques. IET Signal Proc 7:25\u201346","journal-title":"IET Signal Proc"},{"key":"9874_CR30","unstructured":"Degirmenci A (2014). Introduction to hidden Markov models Harvard University,[online] available from: https:\/\/scholar.google.com\/scholar?hl=en&as_sdt=0%2C5&q=Degirmenci+A+%282014%29.+Introduction+to+hidden+Markov+models+Harvard+University&btnG=. Accessed 10 Oct 2016"},{"key":"9874_CR31","doi-asserted-by":"crossref","unstructured":"Degottex G, Kane J, Drugman T, Raitio T, Scherer S (2014). COVAREP\u2014A collaborative voice analysis repository for speech technologies. In: 2014 ieee international conference on acoustics, speech and signal processing (icassp). IEEE, pp 960\u2013964","DOI":"10.1109\/ICASSP.2014.6853739"},{"key":"9874_CR32","doi-asserted-by":"crossref","unstructured":"Deng L (2014). A tutorial survey of architectures, algorithms, and applications for deep learning APSIPA Transactions on Signal and Information Processing 3","DOI":"10.1017\/ATSIP.2014.4"},{"key":"9874_CR33","first-page":"5235","volume":"5","author":"J Deng","year":"2017","unstructured":"Deng J, Fr\u00fchholz S, Zhang Z, Schuller B (2017a) Recognizing emotions from whispered speech based on acoustic feature transfer learning. IEEE Access 5:5235\u20135246","journal-title":"IEEE Access"},{"key":"9874_CR34","doi-asserted-by":"crossref","unstructured":"Deng J, Xia R, Zhang Z, Liu Y, Schuller B (2014). Introducing shared-hidden-layer autoencoders for transfer learning and their application in acoustic emotion recognition. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, pp 4818\u20134822","DOI":"10.1109\/ICASSP.2014.6854517"},{"key":"9874_CR35","doi-asserted-by":"crossref","unstructured":"Deng J, Xu X, Zhang Z, Fr\u00fchholz S, Grandjean D, Schuller B (2017b). Fisher kernels on phase-based features for speech emotion recognition. In: Dialogues with social robots. Springer, pp. 195\u2013203","DOI":"10.1007\/978-981-10-2585-3_15"},{"key":"9874_CR36","first-page":"31","volume":"26","author":"J Deng","year":"2017","unstructured":"Deng J, Xu X, Zhang Z, Fr\u00fchholz S, Schuller B (2017c) Semisupervised autoencoders for speech emotion recognition IEEE\/ACM transactions on audio. Speech, and Language Processing 26:31\u201343","journal-title":"Speech, and Language Processing"},{"key":"9874_CR37","doi-asserted-by":"crossref","first-page":"500","DOI":"10.1109\/LSP.2017.2672753","volume":"24","author":"J Deng","year":"2017","unstructured":"Deng J, Xu X, Zhang Z, Fr\u00fchholz S, Schuller B (2017d) Universum autoencoder-based domain adaptation for speech emotion recognition. IEEE Signal Process Lett 24:500\u2013504","journal-title":"IEEE Signal Process Lett"},{"key":"9874_CR38","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1109\/taslp.2017.2759338","volume":"26","author":"J Deng","year":"2018","unstructured":"Deng J, Xu XZ, Zhang ZX, Fruhholz S, Schuller B (2018) Semisupervised Autoencoders for Speech Emotion Recognition. IEEE-ACM Trans Audio Speech Lang 26:31\u201343. https:\/\/doi.org\/10.1109\/taslp.2017.2759338","journal-title":"IEEE-ACM Trans Audio Speech Lang"},{"key":"9874_CR39","doi-asserted-by":"crossref","first-page":"5231","DOI":"10.1007\/s13369-017-2742-5","volume":"42","author":"M Deriche","year":"2017","unstructured":"Deriche M (2017) A Two-Stage Hierarchical Bilingual Emotion Recognition System Using a Hidden Markov Model and Neural Networks. Arab J Sci Eng 42:5231\u20135249","journal-title":"Arab J Sci Eng"},{"key":"9874_CR40","doi-asserted-by":"publisher","unstructured":"Deriche M, Abo absa AH (2017) A Two-Stage Hierarchical Bilingual Emotion Recognition System Using a Hidden Markov Model and Neural Networks, Arab J Sci.\\ Eng. 42:5231\u20135249. https:\/\/doi.org\/10.1007\/s13369-017-2742-5","DOI":"10.1007\/s13369-017-2742-5"},{"key":"9874_CR41","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1145\/2347736.2347755","volume":"55","author":"P Domingos","year":"2012","unstructured":"Domingos P (2012) A few useful things to know about machine learning. Commun ACM 55:78\u201387","journal-title":"Commun ACM"},{"key":"9874_CR42","first-page":"2121","volume":"12","author":"J Duchi","year":"2011","unstructured":"Duchi J, Hazan E, Singer Y (2011) Adaptive subgradient methods for online learning and stochastic optimization. J Mach Learn Res 12:2121\u20132159","journal-title":"J Mach Learn Res"},{"key":"9874_CR43","doi-asserted-by":"crossref","first-page":"572","DOI":"10.1016\/j.patcog.2010.09.020","volume":"44","author":"M El Ayadi","year":"2011","unstructured":"El Ayadi M, Kamel MS, Karray F (2011) Survey on speech emotion recognition: features, classification schemes, and databases. Pattern Recogn 44:572\u2013587","journal-title":"Pattern Recogn"},{"key":"9874_CR44","doi-asserted-by":"crossref","unstructured":"Endah SN, Widodo AP, Fariq ML, Nadianada SI, Maulana F (2017). Beyond back-propagation learning for diabetic detection: Convergence comparison of gradient descent, momentum and Adaptive Learning Rate. In: 2017 1st International Conference on Informatics and Computational Sciences (ICICoS). IEEE, pp 189\u2013194","DOI":"10.1109\/ICICOS.2017.8276360"},{"key":"9874_CR45","doi-asserted-by":"crossref","first-page":"121","DOI":"10.1016\/j.patcog.2016.03.028","volume":"58","author":"SM Erfani","year":"2016","unstructured":"Erfani SM, Rajasegarar S, Karunasekera S, Leckie C (2016) High-dimensional and large-scale anomaly detection using a linear one-class SVM with deep learning. Pattern Recogn 58:121\u2013134","journal-title":"Pattern Recogn"},{"key":"9874_CR46","doi-asserted-by":"crossref","unstructured":"Etienne C, Fidanza G, Petrovskii A, Devillers L, Schmauch B (2018). Speech Emotion Recognition with Data Augmentation and Layer-wise Learning Rate Adjustment arXiv preprint arXiv:180205630","DOI":"10.21437\/SMM.2018-5"},{"key":"9874_CR47","doi-asserted-by":"crossref","unstructured":"Eyben F, Weninger F, Gross F, Schuller B (2013). Recent developments in opensmile, the munich open-source multimedia feature extractor. In: Proceedings of the 21st ACM international conference on Multimedia. ACM, pp 835\u2013838","DOI":"10.1145\/2502081.2502224"},{"key":"9874_CR48","doi-asserted-by":"crossref","unstructured":"Eyben F, W\u00f6llmer M, Schuller B (2009). OpenEAR\u2014introducing the Munich open-source emotion and affect recognition toolkit. In: 2009 3rd international conference on affective computing and intelligent interaction and workshops. IEEE, pp 1\u20136","DOI":"10.1109\/ACII.2009.5349350"},{"key":"9874_CR49","doi-asserted-by":"crossref","first-page":"190","DOI":"10.1109\/TAFFC.2015.2457417","volume":"7","author":"F Eyben","year":"2015","unstructured":"Eyben F et al (2015) The Geneva minimalistic acoustic parameter set (GeMAPS) for voice research and affective computing. IEEE Trans Affect Comput 7:190\u2013202","journal-title":"IEEE Trans Affect Comput"},{"key":"9874_CR50","doi-asserted-by":"crossref","first-page":"60","DOI":"10.1016\/j.neunet.2017.02.013","volume":"92","author":"HM Fayek","year":"2017","unstructured":"Fayek HM, Lech M, Cavedon L (2017) Evaluating deep learning architectures for Speech Emotion Recognition. Neural Netw 92:60\u201368","journal-title":"Neural Netw"},{"key":"9874_CR51","doi-asserted-by":"crossref","unstructured":"Fei W, Ye X, Sun Z, Huang Y, Zhang X, Shang S (2016). Research on speech emotion recognition based on deep auto-encoder. In: 2016 IEEE International Conference on Cyber Technology in Automation, Control, and Intelligent Systems (CYBER). IEEE, pp 308\u2013312","DOI":"10.1109\/CYBER.2016.7574841"},{"key":"9874_CR52","doi-asserted-by":"crossref","unstructured":"Fonnegra RD, D\u00edaz GM (2018). Speech Emotion Recognition Based on a Recurrent Neural Network Classification Model. In, Cham. Advances in Computer Entertainment Technology. Springer International Publishing, pp 882\u2013892","DOI":"10.1007\/978-3-319-76270-8_59"},{"key":"9874_CR53","doi-asserted-by":"crossref","first-page":"829","DOI":"10.1109\/10.846676","volume":"47","author":"DJ France","year":"2000","unstructured":"France DJ, Shiavi RG, Silverman S, Silverman M, Wilkes M (2000) Acoustical properties of speech as indicators of depression and suicidal risk. IEEE Trans Biomed Eng 47:829\u2013837","journal-title":"IEEE Trans Biomed Eng"},{"key":"9874_CR54","doi-asserted-by":"crossref","unstructured":"Gers FA, Schmidhuber J, Cummins F (1999). Learning to forget: continual prediction with LSTM","DOI":"10.1049\/cp:19991218"},{"key":"9874_CR55","doi-asserted-by":"crossref","unstructured":"Ghosh S, Laksana E, Morency L-P, Scherer S (2016a). Representation learning for speech emotion recognition. In: Interspeech. pp. 3603\u20133607","DOI":"10.21437\/Interspeech.2016-692"},{"key":"9874_CR56","doi-asserted-by":"publisher","unstructured":"Ghosh S, Laksana E, Morency LP, Scherer S, Int Speech Commun A (2016b). Representation Learning for Speech Emotion Recognition. In: 17th Annual Conference of the International Speech Communication Association. Interspeech. Isca-Int Speech Communication Assoc, Baixas, pp 3603\u20133607. doi:https:\/\/doi.org\/10.21437\/Interspeech.2016-692","DOI":"10.21437\/Interspeech.2016-692"},{"key":"9874_CR57","doi-asserted-by":"crossref","first-page":"e0144610","DOI":"10.1371\/journal.pone.0144610","volume":"10","author":"T Giannakopoulos","year":"2015","unstructured":"Giannakopoulos T (2015) Pyaudioanalysis: An open-source python library for audio signal analysis. PLoS One 10:e0144610","journal-title":"PLoS One"},{"key":"9874_CR58","unstructured":"Gjoreski M, Gjoreski H, Kulakov A (n.d.). Automatic recognition of emotions from speech"},{"key":"9874_CR59","first-page":"5","volume":"3","author":"A Gretton","year":"2009","unstructured":"Gretton A, Smola A, Huang J, Schmittfull M, Borgwardt K, Sch\u00f6lkopf B (2009) Covariate shift by kernel mean matching. Dataset shift in machine learning 3:5","journal-title":"Dataset shift in machine learning"},{"key":"9874_CR60","unstructured":"Gulli A, Pal S (2017). Deep learning with Keras. Packt Publishing Ltd,"},{"key":"9874_CR61","first-page":"22","volume":"101","author":"T Gulzar","year":"2014","unstructured":"Gulzar T, Singh A, Sharma S (2014) Comparative analysis of LPCC, MFCC and BFCC for the recognition of Hindi words using artificial neural networks. Int J Comput Appl 101:22\u201327","journal-title":"Int J Comput Appl"},{"key":"9874_CR62","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1016\/j.neucom.2015.09.116","volume":"187","author":"Y Guo","year":"2016","unstructured":"Guo Y, Liu Y, Oerlemans A, Lao S, Wu S, Lew MS (2016) Deep learning for visual understanding: A review. Neurocomputing 187:27\u201348. https:\/\/doi.org\/10.1016\/j.neucom.2015.09.116","journal-title":"Neurocomputing"},{"key":"9874_CR63","doi-asserted-by":"crossref","unstructured":"Gupta D, Bansal P, Choudhary K (2018). The state of the art of feature extraction techniques in speech recognition. In: Speech and language processing for human-machine communications. Springer, pp. 195\u2013207","DOI":"10.1007\/978-981-10-6626-9_22"},{"key":"9874_CR64","doi-asserted-by":"crossref","unstructured":"Hajarolasvadi N, Demirel H (2019). 3D CNN-based speech emotion recognition using K-means clustering and spectrograms entropy 21:479","DOI":"10.3390\/e21050479"},{"key":"9874_CR65","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1145\/1656274.1656278","volume":"11","author":"M Hall","year":"2009","unstructured":"Hall M, Frank E, Holmes G, Pfahringer B, Reutemann P, Witten IH (2009) The WEKA data mining software: an update. ACM SIGKDD explorations newsletter 11:10\u201318","journal-title":"ACM SIGKDD explorations newsletter"},{"key":"9874_CR66","doi-asserted-by":"crossref","unstructured":"Hansen JH, Bou-Ghazale SE (1997). Getting started with SUSAS: A speech under simulated and actual stress database. In: Fifth European Conference on Speech Communication and Technology","DOI":"10.21437\/Eurospeech.1997-494"},{"key":"9874_CR67","doi-asserted-by":"crossref","first-page":"391","DOI":"10.1016\/0167-6393(95)00007-B","volume":"16","author":"JH Hansen","year":"1995","unstructured":"Hansen JH, Cairns DA (1995) Icarus: source generator based real-time recognition of speech in noisy stressful and lombard effect environments. Speech Comm 16:391\u2013422","journal-title":"Speech Comm"},{"key":"9874_CR68","doi-asserted-by":"crossref","unstructured":"Haq S, Jackson PJ (2011). Multimodal emotion recognition. In: machine audition: principles, algorithms and systems. IGI global, pp 398-423","DOI":"10.4018\/978-1-61520-919-4.ch017"},{"key":"9874_CR69","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2015). Delving deep into rectifiers: surpassing human-level performance on imagenet classification. In: Proceedings of the IEEE international conference on computer vision. pp. 1026\u20131034","DOI":"10.1109\/ICCV.2015.123"},{"key":"9874_CR70","doi-asserted-by":"crossref","first-page":"e0220386","DOI":"10.1371\/journal.pone.0220386","volume":"14","author":"P Heracleous","year":"2019","unstructured":"Heracleous P, Yoneyama A (2019) A comprehensive study on bilingual and multilingual speech emotion recognition using a two-pass classification scheme. PloS one 14:e0220386","journal-title":"PloS one"},{"key":"9874_CR71","doi-asserted-by":"crossref","unstructured":"Hershey S et al. (2017). CNN architectures for large-scale audio classification. In: 2017 ieee international conference on acoustics, speech and signal processing (icassp). IEEE, pp 131\u2013135","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"9874_CR72","doi-asserted-by":"crossref","unstructured":"Hinton GE (2012). A practical guide to training restricted Boltzmann machines. In: neural networks: tricks of the trade. Springer, pp 599-619","DOI":"10.1007\/978-3-642-35289-8_32"},{"key":"9874_CR73","doi-asserted-by":"crossref","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"GE Hinton","year":"2006","unstructured":"Hinton GE, Osindero S, Teh Y-W (2006) A fast learning algorithm for deep belief nets. Neural Comput 18:1527\u20131554","journal-title":"Neural Comput"},{"key":"9874_CR74","doi-asserted-by":"crossref","first-page":"504","DOI":"10.1126\/science.1127647","volume":"313","author":"GE Hinton","year":"2006","unstructured":"Hinton GE, Salakhutdinov RR (2006) Reducing the dimensionality of data with neural networks. Science 313:504\u2013507","journal-title":"Science"},{"key":"9874_CR75","doi-asserted-by":"crossref","unstructured":"Hinton G et al. (2012). Deep neural networks for acoustic modeling in speech recognition IEEE Signal processing magazine 29","DOI":"10.1109\/MSP.2012.2205597"},{"key":"9874_CR76","doi-asserted-by":"crossref","unstructured":"Ho N-H, Yang H-J, Kim S-H, Lee GJIA (2020). Multimodal Approach of Speech Emotion Recognition Using Multi-Level Multi-Head Fusion Attention-Based Recurrent Neural Network 8:61672\u201361686","DOI":"10.1109\/ACCESS.2020.2984368"},{"key":"9874_CR77","doi-asserted-by":"crossref","first-page":"69","DOI":"10.1016\/j.inffus.2018.09.008","volume":"49","author":"MS Hossain","year":"2019","unstructured":"Hossain MS, Muhammad G (2019) Emotion recognition using deep learning approach from audio\u2013visual emotional big data. Information Fusion 49:69\u201378","journal-title":"Information Fusion"},{"key":"9874_CR78","doi-asserted-by":"crossref","unstructured":"Huang C, Gong W, Fu W, Feng D (2014a). A research of speech emotion recognition based on deep belief network and SVM Mathematical Problems in Engineering 2014","DOI":"10.1155\/2014\/749604"},{"key":"9874_CR79","doi-asserted-by":"crossref","unstructured":"Huang Y, Hu M, Yu X, Wang T, Yang C Transfer Learning of Deep Neural Network for Speech Emotion Recognition. In, Singapore, 2016a. Pattern recognition. Springer Singapore, pp 721\u2013729","DOI":"10.1007\/978-981-10-3005-5_59"},{"key":"9874_CR80","doi-asserted-by":"crossref","first-page":"1787","DOI":"10.1007\/s12652-017-0644-8","volume":"10","author":"Y Huang","year":"2019","unstructured":"Huang Y, Tian K, Wu A, Zhang G (2019) Feature fusion methods research based on deep belief networks for speech emotion recognition under noise condition. J Ambient Intell Humaniz Comput 10:1787\u20131798","journal-title":"J Ambient Intell Humaniz Comput"},{"key":"9874_CR81","doi-asserted-by":"crossref","unstructured":"Huang Y, Wu A, Zhang G, Li Y (2014b). Speech emotion recognition based on coiflet wavelet packet cepstral coefficients. In: Chinese conference on pattern recognition. Springer, pp 436\u2013443","DOI":"10.1007\/978-3-662-45643-9_46"},{"key":"9874_CR82","first-page":"28.21","volume":"17","author":"Y Huang","year":"2016","unstructured":"Huang Y, Wu A, Zhang G, Li Y (2016b) Speech emotion recognition based on deep belief networks and wavelet packet cepstral coefficients international journal of simulation: systems. Sci Technol 17:28.21\u201328.25","journal-title":"Sci Technol"},{"key":"9874_CR83","doi-asserted-by":"publisher","first-page":"6785","DOI":"10.1007\/s11042-016-3354-x","volume":"76","author":"Z Huang","year":"2017","unstructured":"Huang Z, Xue W, Mao Q, Zhan Y (2017) Unsupervised domain adaptation for speech emotion recognition using PCANet. Multimed Tools Appl 76:6785\u20136799. https:\/\/doi.org\/10.1007\/s11042-016-3354-x","journal-title":"Multimed Tools Appl"},{"key":"9874_CR84","doi-asserted-by":"crossref","unstructured":"Hussain N, Khan MA, Sharif M, Khan SA, Albesher AA, Saba T, Armaghan AJMTAhdos (2020). A deep neural network and classical features based scheme for objects recognition: an application for machine inspection","DOI":"10.1007\/s11042-020-08852-3"},{"key":"9874_CR85","doi-asserted-by":"crossref","unstructured":"Ide H, Kurita T (2017). Improvement of learning for CNN with ReLU activation by sparse regularization. In: 2017 International Joint Conference on Neural Networks (IJCNN). IEEE, pp 2684\u20132691","DOI":"10.1109\/IJCNN.2017.7966185"},{"key":"9874_CR86","unstructured":"Ioffe S, Szegedy C (2015). Batch normalization: Accelerating deep network training by reducing internal covariate shift arXiv preprint arXiv:150203167"},{"key":"9874_CR87","doi-asserted-by":"crossref","unstructured":"Jarchi D, Andreu-Perez J, Kiani M, Vysata O, Kuchynka J, Prochazka A, Sanei SJS (2020). Recognition of Patient Groups with Sleep Related Disorders using Bio-signal Processing and Deep Learning 20:2594","DOI":"10.3390\/s20092594"},{"key":"9874_CR88","doi-asserted-by":"crossref","unstructured":"Jia Y et al. (2014). Caffe: Convolutional architecture for fast feature embedding. In: Proceedings of the 22nd ACM international conference on Multimedia. ACM, pp 675\u2013678","DOI":"10.1145\/2647868.2654889"},{"key":"9874_CR89","doi-asserted-by":"crossref","first-page":"1434","DOI":"10.3390\/s17061434","volume":"17","author":"Y Jian","year":"2017","unstructured":"Jian Y et al (2017) A novel extreme learning machine classification model for e-Nose application based on the multiple kernel approach. Sensors 17:1434","journal-title":"Sensors"},{"key":"9874_CR90","doi-asserted-by":"crossref","first-page":"2730","DOI":"10.3390\/s19122730","volume":"19","author":"W Jiang","year":"2019","unstructured":"Jiang W, Wang Z, Jin JS, Han X, Li C (2019) Speech Emotion Recognition with Heterogeneous Feature Unification of Deep Neural Network. Sensors 19:2730","journal-title":"Sensors"},{"key":"9874_CR91","unstructured":"Kaiser JF (1990). On a simple algorithm to calculate the\u2019energy\u2019of a signal. In: International conference on acoustics, speech, and signal processing. IEEE, pp 381\u2013384"},{"key":"9874_CR92","unstructured":"Kerkeni L, Serrestou Y, Mbarki M, Mahjoub MA, Raoof K, Cl\u00e9der C (2017). Speech emotion recognition: recurrent neural networks compared to SVM and linear regression"},{"key":"9874_CR93","doi-asserted-by":"crossref","unstructured":"Keyvanrad MA, Homayounpour MM (2014). A brief survey on deep belief networks and introducing a new object oriented toolbox (DeeBNet) arXiv preprint arXiv:14083264","DOI":"10.1109\/IJCNN.2015.7280688"},{"key":"9874_CR94","doi-asserted-by":"crossref","unstructured":"Khalid S, Muhammad N, Sharif MJIITS. (2018) Automatic measurement of the traffic sign with digital segmentation and recognition 13:269\u2013279","DOI":"10.1049\/iet-its.2018.5223"},{"key":"9874_CR95","doi-asserted-by":"crossref","unstructured":"Khan H, Sharif M, Bibi N, Muhammad NJTEPJP (2019). A novel algorithm for the detection of cerebral aneurysm using sub-band morphological operation 134:34","DOI":"10.1140\/epjp\/i2019-12432-6"},{"key":"9874_CR96","doi-asserted-by":"crossref","unstructured":"Khan MA et al. (2020). Human action recognition using fusion of multiview and deep features: an application to video surveillance:1\u201327","DOI":"10.1007\/s11042-020-08806-9"},{"key":"9874_CR97","unstructured":"Kingma DP, Ba J (2014). Adam: A method for stochastic optimization arXiv preprint arXiv:14126980"},{"key":"9874_CR98","doi-asserted-by":"crossref","first-page":"401","DOI":"10.3390\/s18020401","volume":"18","author":"B Ko","year":"2018","unstructured":"Ko B (2018) A brief review of facial emotion recognition based on visual information. Sensors 18:401","journal-title":"Sensors"},{"key":"9874_CR99","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1016\/j.procs.2015.10.020","volume":"70","author":"S Lalitha","year":"2015","unstructured":"Lalitha S, Geyasruti D, Narayanan RMS (2015) Emotion Detection Using MFCC and Cepstrum Features. Prog Comput Sci 70:29\u201335. https:\/\/doi.org\/10.1016\/j.procs.2015.10.020","journal-title":"Prog Comput Sci"},{"key":"9874_CR100","first-page":"92","volume":"1","author":"CP Latha","year":"2016","unstructured":"Latha CP, Priya M (2016) A review on deep learning algorithms for speech and facial emotion recognition APTIKOM. Electron J Comput Sci Inf Technol 1:92\u2013108","journal-title":"Electron J Comput Sci Inf Technol"},{"key":"9874_CR101","doi-asserted-by":"crossref","unstructured":"Laydrus NC, Ambikairajah E, Celler B (2007). Automated sound analysis system for home telemonitoring using shifted delta cepstral features. In: 2007 15th International Conference on Digital Signal Processing. IEEE, pp 135\u2013138","DOI":"10.1109\/ICDSP.2007.4288537"},{"key":"9874_CR102","doi-asserted-by":"publisher","unstructured":"Le D, Provost EM (2015). Data selection for acoustic emotion recognition: Analyzing and comparing utterance and sub-utterance selection strategies. In: 2015 International Conference on Affective Computing and Intelligent Interaction (ACII), 21\u201324 Sept. 2015. pp 146\u2013152. doi:https:\/\/doi.org\/10.1109\/ACII.2015.7344564","DOI":"10.1109\/ACII.2015.7344564"},{"key":"9874_CR103","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun Y, Bengio Y, Hinton G (2015) Deep learning. Nature 521:436\u2013444. https:\/\/doi.org\/10.1038\/nature14539","journal-title":"Nature"},{"key":"9874_CR104","doi-asserted-by":"crossref","first-page":"293","DOI":"10.1109\/TSA.2004.838534","volume":"13","author":"CM Lee","year":"2005","unstructured":"Lee CM, Narayanan SS (2005) Toward detecting emotions in spoken dialogs. IEEE transactions on speech and audio processing 13:293\u2013303","journal-title":"IEEE transactions on speech and audio processing"},{"key":"9874_CR105","doi-asserted-by":"crossref","unstructured":"Lee J, Tashev I (2015). High-level feature representation using recurrent neural network for speech emotion recognition. In: Sixteenth Annual Conference of the International Speech Communication Association","DOI":"10.21437\/Interspeech.2015-336"},{"key":"9874_CR106","doi-asserted-by":"crossref","first-page":"119","DOI":"10.1016\/j.neucom.2015.06.008","volume":"168","author":"C Li","year":"2015","unstructured":"Li C, Sanchez R-V, Zurita G, Cerrada M, Cabrera D, V\u00e1squez RE (2015) Multimodal deep support vector classification with homologous features and its application to gearbox fault diagnosis. Neurocomputing 168:119\u2013127","journal-title":"Neurocomputing"},{"key":"9874_CR107","doi-asserted-by":"crossref","first-page":"271","DOI":"10.1016\/j.neucom.2017.07.050","volume":"273","author":"Z-T Liu","year":"2018","unstructured":"Liu Z-T, Wu M, Cao W-H, Mao J-W, Xu J-P, Tan G-Z (2018) Speech emotion recognition based on feature selection and extreme learning machine decision tree. Neurocomputing 273:271\u2013280","journal-title":"Neurocomputing"},{"key":"9874_CR108","doi-asserted-by":"crossref","first-page":"46","DOI":"10.1016\/j.csl.2016.03.001","volume":"40","author":"I Lopez-Moreno","year":"2016","unstructured":"Lopez-Moreno I, Gonzalez-Dominguez J, Martinez D, Plchot O, Gonzalez-Rodriguez J, Moreno PJ (2016) On the use of deep feedforward neural networks for automatic language identification. Comput Speech Lang 40:46\u201359","journal-title":"Comput Speech Lang"},{"key":"9874_CR109","unstructured":"Lyons J (2013). Python speech features. https:\/\/github.com\/jameslyons\/python_speech_features. Accessed 16-03-2017 2017"},{"key":"9874_CR110","doi-asserted-by":"crossref","first-page":"485","DOI":"10.1016\/j.aej.2016.09.002","volume":"56","author":"K Mannepalli","year":"2017","unstructured":"Mannepalli K, Sastry PN, Suman M (2017) A novel adaptive fractional deep belief networks for speaker emotion recognition. Alex Eng J 56:485\u2013497","journal-title":"Alex Eng J"},{"key":"9874_CR111","doi-asserted-by":"crossref","first-page":"779","DOI":"10.1007\/s10772-016-9368-y","volume":"19","author":"K Mannepalli","year":"2016","unstructured":"Mannepalli K, Sastry PN, Suman M (2016) FDBN: Design and development of Fractional Deep Belief Networks for speaker emotion recognition. Int J Speech Technol 19:779\u2013790","journal-title":"Int J Speech Technol"},{"key":"9874_CR112","doi-asserted-by":"crossref","first-page":"178","DOI":"10.1016\/j.comcom.2016.03.010","volume":"89","author":"LY Mano","year":"2016","unstructured":"Mano LY et al (2016) Exploiting IoT technologies for enhancing health smart homes through patient identification and emotion recognition. Comput Commun 89:178\u2013190","journal-title":"Comput Commun"},{"key":"9874_CR113","doi-asserted-by":"publisher","unstructured":"Manolov A, Boumbarov O, Manolova A, Poulkov V, Tonchev K (2017). Feature selection in affective speech classification. In: 2017 40th international conference on telecommunications and signal processing, TSP 2017. pp. 354\u2013358. doi:https:\/\/doi.org\/10.1109\/TSP.2017.8076004","DOI":"10.1109\/TSP.2017.8076004"},{"key":"9874_CR114","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.specom.2017.06.006","volume":"93","author":"Q Mao","year":"2017","unstructured":"Mao Q, Xu G, Xue W, Gou J, Zhan Y (2017) Learning emotion-discriminative and domain-invariant features for domain adaptation in speech emotion recognition. Speech Comm 93:1\u201310","journal-title":"Speech Comm"},{"key":"9874_CR115","doi-asserted-by":"crossref","unstructured":"Martin O, Kotsia I, Macq B, Pitas I (2006). The eNTERFACE'05 audio-visual emotion database. In: 22nd International Conference on Data Engineering Workshops (ICDEW\u201906). IEEE, pp 8\u20138","DOI":"10.1109\/ICDEW.2006.145"},{"key":"9874_CR116","doi-asserted-by":"crossref","unstructured":"McCormick C (2014). Deep Learning Tutorial - Softmax Regression. http:\/\/mccormickml.com\/2014\/06\/13\/deep-learning-tutorial-softmax-regression\/. Accessed 13 Jun 2014","DOI":"10.1201\/b17103-3"},{"key":"9874_CR117","doi-asserted-by":"crossref","unstructured":"McFee B, Raffel C, Liang D, Ellis DP, McVicar M, Battenberg E, Nieto O (2015). Librosa: Audio and music signal analysis in python. In: Proceedings of the 14th python in science conference","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"9874_CR118","unstructured":"McLoughlin IV, Chance R (1997). LSP-based speech modification for intelligibility enhancement. In: Proceedings of 13th International Conference on Digital Signal Processing. IEEE, pp 591\u2013594"},{"key":"9874_CR119","doi-asserted-by":"crossref","first-page":"72845","DOI":"10.1109\/ACCESS.2018.2881096","volume":"6","author":"AH Meftah","year":"2018","unstructured":"Meftah AH, Alotaibi YA, Selouani S-A (2018) Evaluation of an Arabic speech corpus of emotions: A perceptual and statistical analysis. IEEE Access 6:72845\u201372861","journal-title":"IEEE Access"},{"key":"9874_CR120","doi-asserted-by":"publisher","unstructured":"Meftah A, Alotaibi Y, Selouani S (2016). Emotional speech recognition: A multilingual perspective. In: 2016 International Conference on Bio-engineering for Smart Technologies (BioSMART), 4\u20137 Dec. 2016. pp 1\u20134. doi:https:\/\/doi.org\/10.1109\/BIOSMART.2016.7835600","DOI":"10.1109\/BIOSMART.2016.7835600"},{"key":"9874_CR121","doi-asserted-by":"crossref","unstructured":"Mehmood A et al. (2020). Prosperous human gait recognition: an end-to-end system based on pre-trained CNN features selection","DOI":"10.1007\/s11042-020-08928-0"},{"key":"9874_CR122","doi-asserted-by":"crossref","first-page":"416","DOI":"10.3390\/s18020416","volume":"18","author":"D Mehta","year":"2018","unstructured":"Mehta D, Siddiqui M, Javaid A (2018) Facial emotion recognition: A survey and real-world user experiences in mixed reality. Sensors 18:416","journal-title":"Sensors"},{"key":"9874_CR123","doi-asserted-by":"crossref","first-page":"125868","DOI":"10.1109\/ACCESS.2019.2938007","volume":"7","author":"H Meng","year":"2019","unstructured":"Meng H, Yan T, Yuan F, Wei H (2019) Speech emotion recognition from 3D log-Mel spectrograms with deep learning network. IEEE access 7:125868\u2013125881","journal-title":"IEEE access"},{"key":"9874_CR124","unstructured":"Mesnil G et al. (2011). Unsupervised and transfer learning challenge: a deep learning approach. In: Proceedings of the 2011 International Conference on Unsupervised and Transfer Learning workshop-Volume 27. JMLR. org, pp 97\u2013111"},{"key":"9874_CR125","doi-asserted-by":"crossref","unstructured":"Michel P, El Kaliouby R (2003). Real time facial expression recognition in video using support vector machines. In: Proceedings of the 5th international conference on Multimodal interfaces. ACM, pp 258\u2013264","DOI":"10.1145\/958432.958479"},{"key":"9874_CR126","unstructured":"MicroPyramid (2011) Understanding Audio Quality: Bit Rate, Sample Rate. https:\/\/micropyramid.com\/blog\/understanding-audio-quality-bit-rate-sample-rate\/. 2011"},{"key":"9874_CR127","doi-asserted-by":"crossref","unstructured":"Milton A, Roy SS, Selvi ST (2013). SVM scheme for speech emotion recognition using MFCC feature international journal of computer applications 69","DOI":"10.5120\/11872-7667"},{"key":"9874_CR128","doi-asserted-by":"crossref","unstructured":"Mishra AN, Shrotriya M, Sharan S (2010). Comparative wavelet, PLP, and LPC speech recognition techniques on the Hindi speech digits database. In: Second International Conference on Digital Image Processing. International Society for Optics and Photonics, p 754634","DOI":"10.1117\/12.856318"},{"key":"9874_CR129","unstructured":"Molchanov D, Ashukha A, Vetrov D (2017). Variational dropout sparsifies deep neural networks. In: Proceedings of the 34th International Conference on Machine Learning-Volume 70. JMLR. org, pp 2498\u20132507"},{"key":"9874_CR130","doi-asserted-by":"crossref","first-page":"98","DOI":"10.1016\/j.specom.2006.11.004","volume":"49","author":"D Morrison","year":"2007","unstructured":"Morrison D, Wang R, De Silva LC (2007) Ensemble methods for spoken emotion recognition in call-centres. Speech Comm 49:98\u2013112","journal-title":"Speech Comm"},{"key":"9874_CR131","doi-asserted-by":"crossref","unstructured":"Mu Y, G\u00f3mez LAH, Montes AC, MART\u00cdNEZ CA, Wang X, Gao H (2017). Speech emotion recognition using convolutional-recurrent neural networks with attention model DEStech transactions on computer science and engineering","DOI":"10.12783\/dtcse\/cii2017\/17273"},{"key":"9874_CR132","unstructured":"Muda L, Begam M, Elamvazuthi I (2010). Voice recognition algorithms using mel frequency cepstral coefficient (MFCC) and dynamic time warping (DTW) techniques arXiv preprint arXiv:10034083"},{"key":"9874_CR133","doi-asserted-by":"crossref","unstructured":"Mukherjee H, Dhar A, Obaidullah SM, Phadikar S, Roy KJMT, Applications (2020). Image-based features for speech signal classification:1\u201317","DOI":"10.1201\/9780429277573-1"},{"key":"9874_CR134","doi-asserted-by":"crossref","first-page":"1097","DOI":"10.1121\/1.405558","volume":"93","author":"IR Murray","year":"1993","unstructured":"Murray IR, Arnott JL (1993) Toward the simulation of emotion in synthetic speech: A review of the literature on human vocal emotion. J Acoust Soc Am 93:1097\u20131108","journal-title":"J Acoust Soc Am"},{"key":"9874_CR135","doi-asserted-by":"crossref","unstructured":"Naz I, Muhammad N, Yasmin M, Sharif M, Shah JH, Fernandes SLJJoMiM, Biology (2019). Robust discrimination of leukocytes protuberant types for early diagnosis of leukemia 19:1950055","DOI":"10.1142\/S0219519419500556"},{"key":"9874_CR136","doi-asserted-by":"crossref","unstructured":"Neiberg D, Elenius K, Laskowski K (2006). Emotion recognition in spontaneous speech using GMMs. In: Ninth international conference on spoken language processing","DOI":"10.21437\/Interspeech.2006-277"},{"key":"9874_CR137","doi-asserted-by":"crossref","unstructured":"Neumann M, Vu NT (2017). Attentive convolutional neural network based speech emotion recognition: A study on the impact of input features, signal length, and acted speech arXiv preprint arXiv:170600612","DOI":"10.21437\/Interspeech.2017-917"},{"key":"9874_CR138","unstructured":"Ng A (2017). Improving deep neural networks: Hyperparameter tuning, regularization and optimization Deeplearning ai on Coursera"},{"key":"9874_CR139","doi-asserted-by":"crossref","first-page":"603","DOI":"10.1016\/S0167-6393(03)00099-2","volume":"41","author":"TL Nwe","year":"2003","unstructured":"Nwe TL, Foo SW, De Silva LC (2003) Speech emotion recognition using hidden Markov models. Speech Comm 41:603\u2013623","journal-title":"Speech Comm"},{"key":"9874_CR140","doi-asserted-by":"crossref","first-page":"233","DOI":"10.1016\/j.eswa.2018.03.056","volume":"105","author":"HF Nweke","year":"2018","unstructured":"Nweke HF, Teh YW, Al-Garadi MA, Alo UR (2018) Deep learning algorithms for human activity recognition using mobile and wearable sensor networks: State of the art and research challenges. Expert Systems with Applications 105:233\u2013261","journal-title":"Expert Systems with Applications"},{"key":"9874_CR141","doi-asserted-by":"crossref","unstructured":"Pannu HS, Ahuja S, Dang N, Soni S, Malhi AKJMT, APPLICATIONS (2020). Deep learning based image classification for intestinal hemorrhage","DOI":"10.1007\/s11042-020-08905-7"},{"key":"9874_CR142","doi-asserted-by":"crossref","unstructured":"Papakostas M, Siantikos G, Giannakopoulos T, Spyrou E, Sgouropoulos D (2017a). Recognizing emotional states using speech information. In: GeNeDis 2016. Springer, pp 155-164","DOI":"10.1007\/978-3-319-57348-9_13"},{"key":"9874_CR143","doi-asserted-by":"crossref","unstructured":"Papakostas M, Spyrou E, Giannakopoulos T, Siantikos G, Sgouropoulos D, Mylonas P, Makedon F (2017b). Deep visual attributes vs. hand-crafted audio features on multidomain speech emotion recognition computation 5:26","DOI":"10.3390\/computation5020026"},{"key":"9874_CR144","doi-asserted-by":"crossref","unstructured":"Partila P, Voznak M, Tovarek J (2015a). Pattern recognition methods and features selection for speech emotion recognition system The Scientific World Journal 2015","DOI":"10.1155\/2015\/573068"},{"key":"9874_CR145","doi-asserted-by":"publisher","first-page":"573068","DOI":"10.1155\/2015\/573068","volume":"2015","author":"P Partila","year":"2015","unstructured":"Partila P, Voznak M, Tovarek J (2015b) Pattern Recognition Methods and Features Selection for Speech Emotion Recognition System. TheScientificWorldJournal 2015:573068\u2013573067. https:\/\/doi.org\/10.1155\/2015\/573068","journal-title":"TheScientificWorldJournal"},{"key":"9874_CR146","doi-asserted-by":"crossref","first-page":"814","DOI":"10.1016\/j.specom.2012.02.002","volume":"54","author":"E Pavez","year":"2012","unstructured":"Pavez E, Silva JF (2012) Analysis and design of wavelet-packet cepstral coefficients for automatic speech recognition. Speech Comm 54:814\u2013835","journal-title":"Speech Comm"},{"key":"9874_CR147","doi-asserted-by":"crossref","unstructured":"Picard RW, Vyzas E, Healey J (2001). Toward machine emotional intelligence: Analysis of affective physiological state IEEE Transactions on Pattern Analysis & Machine Intelligence:1175\u20131191","DOI":"10.1109\/34.954607"},{"key":"9874_CR148","doi-asserted-by":"crossref","first-page":"295","DOI":"10.1007\/s11071-009-9649-y","volume":"61","author":"ES Pires","year":"2010","unstructured":"Pires ES, Machado JT, de Moura OP, Cunha JB, Mendes L (2010) Particle swarm optimization with fractional-order velocity. Nonlinear Dyn 61:295\u2013301","journal-title":"Nonlinear Dyn"},{"key":"9874_CR149","doi-asserted-by":"crossref","first-page":"42","DOI":"10.1016\/j.knosys.2016.06.009","volume":"108","author":"S Poria","year":"2016","unstructured":"Poria S, Cambria E, Gelbukh A (2016) Aspect extraction for opinion mining with a deep convolutional neural network. Knowl.-Based Syst 108:42\u201349","journal-title":"Knowl.-Based Syst"},{"key":"9874_CR150","unstructured":"Povey D et al. (2011). The Kaldi speech recognition toolkit. In: IEEE 2011 Workshop on automatic speech recognition and understanding, 2011. vol CONF. IEEE Signal Processing Society,"},{"key":"9874_CR151","unstructured":"Prabhakar OP, Sahu NK (2013). A survey on: voice command recognition technique international journal of advanced research in computer science and software engineering 3"},{"key":"9874_CR152","unstructured":"Rabiner LR (1978). Digital processing of speech signal digital processing of speech signal"},{"key":"9874_CR153","unstructured":"Rabiner LR, Gold B (1975). Theory and application of digital signal processing Englewood cliffs, NJ, prentice-Hall, Inc, 1975 777 p"},{"key":"9874_CR154","doi-asserted-by":"crossref","unstructured":"Raj RJS, Shobana SJ, Pustokhina IV, Pustokhin DA, Gupta D, Shankar KJIA (2020). Optimal Feature Selection-Based Medical Image Classification Using Deep Learning Model in Internet of Medical Things 8:58006\u201358017","DOI":"10.1109\/ACCESS.2020.2981337"},{"key":"9874_CR155","unstructured":"Ralph Abbey TH, and Tao Wang (2017). Methods of multinomial classification using support vector machines paper presented at the SAS\u00ae global forum, Orlando, Florida"},{"key":"9874_CR156","unstructured":"Rana R, Epps J, Jurdak R, Li X, Goecke R, Brereton M, Soar J (n.d.). Gated Recurrent Unit (GRU) for Emotion Classification from Noisy Speech"},{"key":"9874_CR157","doi-asserted-by":"crossref","unstructured":"Ranzato MA, Poultney C, Chopra S, Cun YL (2007). Efficient learning of sparse representations with an energy-based model. In: Advances in neural information processing systems. pp. 1137\u20131144","DOI":"10.7551\/mitpress\/7503.003.0147"},{"key":"9874_CR158","doi-asserted-by":"crossref","first-page":"72","DOI":"10.1109\/89.365379","volume":"3","author":"DA Reynolds","year":"1995","unstructured":"Reynolds DA, Rose RC (1995) Robust text-independent speaker identification using Gaussian mixture speaker models. IEEE transactions on speech and audio processing 3:72\u201383","journal-title":"IEEE transactions on speech and audio processing"},{"key":"9874_CR159","unstructured":"Rifai S, Vincent P, Muller X, Glorot X, Bengio Y (2011). Contractive auto-encoders: Explicit invariance during feature extraction. In: Proceedings of the 28th International Conference on International Conference on Machine Learning. Omnipress, pp 833\u2013840"},{"key":"9874_CR160","doi-asserted-by":"crossref","unstructured":"Roy T, Marwala T, Chakraverty SJMMiIS (2020). A Survey of Classification Techniques in Speech Emotion Recognition:33\u201348","DOI":"10.1002\/9781119585640.ch3"},{"key":"9874_CR161","unstructured":"Ruder S (2016). An overview of gradient descent optimization algorithms arXiv preprint arXiv:160904747"},{"key":"9874_CR162","unstructured":"Salakhutdinov R, Larochelle H (2010) Efficient learning of deep Boltzmann machines. In: Proceedings of the thirteenth international conference on artificial intelligence and statistics. pp. 693\u2013700"},{"key":"9874_CR163","doi-asserted-by":"crossref","unstructured":"Satt A, Rozenberg S, Hoory R (2017). Efficient emotion recognition from speech using deep learning on spectrograms. In: INTERSPEECH. pp. 1089\u20131093","DOI":"10.21437\/Interspeech.2017-200"},{"key":"9874_CR164","doi-asserted-by":"crossref","unstructured":"Schaul T et al. (2010). PyBrain Journal of Machine Learning Research 11:743\u2013746","DOI":"10.2217\/pgs.10.78"},{"key":"9874_CR165","doi-asserted-by":"crossref","first-page":"143","DOI":"10.1037\/0033-2909.99.2.143","volume":"99","author":"KR Scherer","year":"1986","unstructured":"Scherer KR (1986) Vocal affect expression: A review and a model for future research. Psychol Bull 99:143","journal-title":"Psychol Bull"},{"key":"9874_CR166","doi-asserted-by":"crossref","unstructured":"Schuller B, Rigoll G, Lang M (2004). Speech emotion recognition combining acoustic features and linguistic information in a hybrid support vector machine-belief network architecture. In: Acoustics, Speech, and Signal Processing, 2004. Proceedings.(ICASSP'04). IEEE International Conference on. IEEE, pp I-577","DOI":"10.1109\/ICASSP.2004.1326051"},{"key":"9874_CR167","doi-asserted-by":"crossref","unstructured":"Schuller B, Steidl S, Batliner A (2009). The interspeech 2009 emotion challenge. In: Tenth Annual Conference of the International Speech Communication Association","DOI":"10.21437\/Interspeech.2009-103"},{"key":"9874_CR168","unstructured":"Schuller B, Steidl S, Batliner A, Burkhardt F, Devillers L, M\u00fcller C, Narayanan SS The INTERSPEECH (2010). Paralinguistic challenge. In: Eleventh Annual Conference of the International Speech Communication Association, 2010"},{"key":"9874_CR169","doi-asserted-by":"crossref","unstructured":"Seide F, Agarwal A (2016). CNTK: Microsoft\u2019s open-source deep-learning toolkit. In: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. ACM, pp 2135\u20132135","DOI":"10.1145\/2939672.2945397"},{"key":"9874_CR170","doi-asserted-by":"crossref","unstructured":"Severyn A, Moschitti A (2015). Twitter sentiment analysis with deep convolutional neural networks. In: Proceedings of the 38th International ACM SIGIR Conference on Research and Development in Information Retrieval. ACM, pp 959\u2013962","DOI":"10.1145\/2766462.2767830"},{"key":"9874_CR171","doi-asserted-by":"publisher","unstructured":"Sezgin M, Gunsel B, Karabulut Kurt G (2012a). Perceptual audio features for emotion detection EURASIP journal on audio, Speech, and Music Processing 2012 doi:https:\/\/doi.org\/10.1186\/1687-4722-2012-16","DOI":"10.1186\/1687-4722-2012-16"},{"key":"9874_CR172","doi-asserted-by":"crossref","first-page":"26","DOI":"10.1016\/j.specom.2014.09.002","volume":"67","author":"C Sezgin","year":"2015","unstructured":"Sezgin C, Gunsel B, Krajewski J (2015) Medium term speaker state detection by perceptually masked spectral features. Speech Comm 67:26\u201341","journal-title":"Speech Comm"},{"key":"9874_CR173","doi-asserted-by":"crossref","first-page":"16","DOI":"10.1186\/1687-4722-2012-16","volume":"2012","author":"MC Sezgin","year":"2012","unstructured":"Sezgin MC, Gunsel B, Kurt GK (2012b) Perceptual audio features for emotion detection EURASIP journal on audio. Speech, and Music Processing 2012:16","journal-title":"Speech, and Music Processing"},{"key":"9874_CR174","unstructured":"Shaburov V, Monastyrshyn Y (2017). Emotion recognition in video conferencing. Google Patents,"},{"key":"9874_CR175","unstructured":"Shahsavarani S (2018). Speech emotion recognition using convolutional neural networks"},{"key":"9874_CR176","unstructured":"Shami MT, Kamel MS (2005). Segment-based approach to the recognition of emotions in speech. In: 2005 IEEE International Conference on Multimedia and Expo. IEEE, p 4 pp."},{"key":"9874_CR177","doi-asserted-by":"crossref","first-page":"16195","DOI":"10.1007\/s11042-018-7030-1","volume":"78","author":"M Sharma","year":"2019","unstructured":"Sharma M, Jalal AS, Khan A (2019) Emotion recognition using facial expression by fusing key points descriptor and texture features. Multimed Tools Appl 78:16195\u201316219","journal-title":"Multimed Tools Appl"},{"key":"9874_CR178","doi-asserted-by":"publisher","unstructured":"Sivanagaraja T, Ho MK, Khong AWH, Wang Y (2017). End-to-end speech emotion recognition using multi-scale convolution networks. In: 2017 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC), 12\u201315 Dec. 2017. pp 189\u2013192. doi:https:\/\/doi.org\/10.1109\/APSIPA.2017.8282026","DOI":"10.1109\/APSIPA.2017.8282026"},{"key":"9874_CR179","doi-asserted-by":"crossref","unstructured":"Soong F, Juang B (1984). Line spectrum pair (LSP) and speech data compression. In: ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing. IEEE, pp 37\u201340","DOI":"10.1109\/ICASSP.1984.1172448"},{"key":"9874_CR180","doi-asserted-by":"publisher","unstructured":"Srikanth M, Pravena D, Govind D (2018a). Tamil speech emotion recognition using deep belief network(DBN) vol 678. doi:https:\/\/doi.org\/10.1007\/978-3-319-67934-1_29","DOI":"10.1007\/978-3-319-67934-1_29"},{"key":"9874_CR181","doi-asserted-by":"crossref","unstructured":"Srikanth M, Pravena D, Govind D (2018b). Tamil Speech Emotion Recognition Using Deep Belief Network(DBN). In, Cham. Advances in Signal Processing and Intelligent Recognition Systems. Springer International Publishing, pp 328\u2013336","DOI":"10.1007\/978-3-319-67934-1_29"},{"key":"9874_CR182","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava N, Hinton G, Krizhevsky A, Sutskever I, Salakhutdinov R (2014) Dropout: a simple way to prevent neural networks from overfitting. J Mach Learn Res 15:1929\u20131958","journal-title":"J Mach Learn Res"},{"key":"9874_CR183","volume-title":"Automatic classification of emotion related user states in spontaneous children\u2019s speech","author":"S Steidl","year":"2009","unstructured":"Steidl S (2009) Automatic classification of emotion related user states in spontaneous children\u2019s speech. University of Erlangen-Nuremberg Erlangen, Germany"},{"key":"9874_CR184","doi-asserted-by":"publisher","unstructured":"Stolar MN, Lech M, Bolia RS, Skinner M (2017). Real time speech emotion recognition using RGB image classification and transfer learning. In: 2017 11th International Conference on Signal Processing and Communication Systems (ICSPCS), 13\u201315 Dec. 2017. pp 1\u20138. doi:https:\/\/doi.org\/10.1109\/ICSPCS.2017.8270472","DOI":"10.1109\/ICSPCS.2017.8270472"},{"key":"9874_CR185","unstructured":"Sugiyama M, Nakajima S, Kashima H, Buenau PV, Kawanabe M (2008) Direct importance estimation with model selection and its application to covariate shift adaptation. In: Advances in neural information processing systems. pp. 1433\u20131440"},{"key":"9874_CR186","doi-asserted-by":"crossref","first-page":"931","DOI":"10.1007\/s10772-018-9551-4","volume":"21","author":"L Sun","year":"2018","unstructured":"Sun L, Chen J, Xie K, Gu T (2018) Deep and shallow features fusion based on deep convolutional neural network for speech emotion recognition. Int J Speech Technol 21:931\u2013940","journal-title":"Int J Speech Technol"},{"key":"9874_CR187","doi-asserted-by":"crossref","unstructured":"Sun R, Moore E (2011). Investigating glottal parameters and teager energy operators in emotion recognition. In: International Conference on Affective Computing and Intelligent Interaction. Springer, pp 425\u2013434","DOI":"10.1007\/978-3-642-24571-8_54"},{"key":"9874_CR188","doi-asserted-by":"publisher","unstructured":"Sunitha Ram C, Ponnusamy R (2014). An effective automatic speech emotion recognition for Tamil language based on DWT and MFCC using Stability-plasticity dilemma Neural network. In: 2014 International conference on information communication and embedded systems, ICICES, 2015. doi:https:\/\/doi.org\/10.1109\/ICICES.2014.7034102","DOI":"10.1109\/ICICES.2014.7034102"},{"key":"9874_CR189","unstructured":"Sutskever I, Vinyals O, Le QV. Sequence to sequence learning with neural networks. In: Advances in neural information processing systems, 2014. pp. 3104\u20133112"},{"key":"9874_CR190","doi-asserted-by":"crossref","first-page":"293","DOI":"10.1023\/A:1018628609742","volume":"9","author":"JA Suykens","year":"1999","unstructured":"Suykens JA, Vandewalle J (1999) Least squares support vector machine classifiers. Neural Process Lett 9:293\u2013300","journal-title":"Neural Process Lett"},{"key":"9874_CR191","doi-asserted-by":"publisher","first-page":"93","DOI":"10.1007\/s10772-018-9491-z","volume":"21","author":"M Swain","year":"2018","unstructured":"Swain M, Routray A, Kabisatpathy P (2018) Databases, features and classifiers for speech emotion recognition: a review. Int J Speech Technol 21:93\u2013120. https:\/\/doi.org\/10.1007\/s10772-018-9491-z","journal-title":"Int J Speech Technol"},{"key":"9874_CR192","doi-asserted-by":"crossref","unstructured":"Szegedy C et al. (2015). Going deeper with convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"9874_CR193","unstructured":"Tang Y (2013). Deep learning using support vector machines CoRR, abs\/13060239 2"},{"key":"9874_CR194","doi-asserted-by":"crossref","unstructured":"Tawari A, Trivedi MMJITom (2010). Speech emotion analysis: Exploring the role of context 12:502\u2013509","DOI":"10.1109\/TMM.2010.2058095"},{"key":"9874_CR195","doi-asserted-by":"crossref","first-page":"599","DOI":"10.1109\/TASSP.1980.1163453","volume":"28","author":"H Teager","year":"1980","unstructured":"Teager H (1980) Some observations on oral air flow during phonation IEEE transactions on acoustics. Speech, and Signal Processing 28:599\u2013601","journal-title":"Speech, and Signal Processing"},{"key":"9874_CR196","unstructured":"Teager HM, Teager SM (1983). A phenomenological model for vowel production in the vocal tract Speech Science: Recent Advances:73\u2013109"},{"key":"9874_CR197","unstructured":"Team TTD et al. (2016). Theano: A Python framework for fast computation of mathematical expressions arXiv preprint arXiv:160502688"},{"key":"9874_CR198","doi-asserted-by":"crossref","first-page":"75","DOI":"10.1007\/s13042-010-0004-x","volume":"1","author":"DL Tong","year":"2010","unstructured":"Tong DL, Mintram R (2010) Genetic algorithm-neural network (GANN): a study of neural network activation functions and depth of genetic algorithm search applied to feature selection. Int J Mach Learn Cybern 1:75\u201387","journal-title":"Int J Mach Learn Cybern"},{"key":"9874_CR199","doi-asserted-by":"crossref","unstructured":"Torres-Carrasquillo PA, Singer E, Kohler MA, Greene RJ, Reynolds DA, Deller JR (2002). Approaches to language identification using Gaussian mixture models and shifted delta cepstral features. In: Seventh international conference on spoken language processing","DOI":"10.1109\/ICASSP.2002.5743828"},{"key":"9874_CR200","doi-asserted-by":"crossref","first-page":"026216","DOI":"10.1103\/PhysRevE.63.026216","volume":"63","author":"MA Trevisan","year":"2001","unstructured":"Trevisan MA, Eguia MC, Mindlin GB (2001) Nonlinear aspects of analysis and synthesis of speech time series data. Phys Rev E 63:026216","journal-title":"Phys Rev E"},{"key":"9874_CR201","doi-asserted-by":"crossref","unstructured":"Vedaldi A, Lenc K (2015). Matconvnet: Convolutional neural networks for matlab. In: Proceedings of the 23rd ACM international conference on Multimedia. ACM, pp 689\u2013692","DOI":"10.1145\/2733373.2807412"},{"key":"9874_CR202","doi-asserted-by":"crossref","unstructured":"Ververidis D, Kotropoulos C (2005). Emotional speech classification using Gaussian mixture models and the sequential floating forward selection algorithm. In: 2005 IEEE International Conference on Multimedia and Expo. IEEE, pp 1500\u20131503","DOI":"10.1109\/ICME.2005.1521717"},{"key":"9874_CR203","doi-asserted-by":"crossref","unstructured":"Vincent P, Larochelle H, Bengio Y, Manzagol P-A (2008). Extracting and composing robust features with denoising autoencoders. In: Proceedings of the 25th international conference on Machine learning. ACM, pp 1096\u20131103","DOI":"10.1145\/1390156.1390294"},{"key":"9874_CR204","unstructured":"vlab.amrita.edu (2019) Non-stationary nature of speech signal. Amrita Vishwa Vidyapeetham http:\/\/vlabamritaedu\/?sub=3&brch=164&sim=371&cnt=1104 Accessed 17 October 2019 2019"},{"key":"9874_CR205","unstructured":"Wan L, Zeiler M, Zhang S, Le Cun Y, Fergus R (2013). Regularization of neural networks using dropconnect. In: International conference on machine learning. pp. 1058\u20131066"},{"key":"9874_CR206","doi-asserted-by":"crossref","unstructured":"Wei P, Zhao Y (2019). A novel speech emotion recognition algorithm based on wavelet kernel sparse classifier in stacked deep auto-encoder model Personal and Ubiquitous Computing:1\u20139","DOI":"10.1007\/s00779-019-01246-9"},{"key":"9874_CR207","doi-asserted-by":"crossref","unstructured":"Wen G, Li H, Huang J, Li D, Xun E (2017). Random deep belief networks for recognizing emotions from speech signals Comput Intell Neurosci 2017","DOI":"10.1155\/2017\/1945630"},{"key":"9874_CR208","first-page":"547","volume":"16","author":"F Weninger","year":"2015","unstructured":"Weninger F, Bergmann J, Schuller B (2015) Introducing currennt: The munich open-source cuda recurrent neural network toolkit. J Mach Learn Res 16:547\u2013551","journal-title":"J Mach Learn Res"},{"key":"9874_CR209","unstructured":"Weninger F, Ringeval F, Marchi E, Schuller BW Discriminatively trained recurrent neural networks for continuous dimensional emotion recognition from audio. In: IJCAI, 2016. pp. 2196\u20132202"},{"key":"9874_CR210","doi-asserted-by":"crossref","first-page":"1238","DOI":"10.1121\/1.1913238","volume":"52","author":"CE Williams","year":"1972","unstructured":"Williams CE, Stevens KN (1972) Emotions and speech: Some acoustical correlates. J Acoust Soc Am 52:1238\u20131250","journal-title":"J Acoust Soc Am"},{"key":"9874_CR211","doi-asserted-by":"crossref","unstructured":"W\u00f6llmer M, Metallinou A, Eyben F, Schuller B, Narayanan S (2010). Context-sensitive multimodal emotion recognition from speech and facial expression using bidirectional lstm modeling. In: Proc. INTERSPEECH 2010, Makuhari. pp. 2362\u20132365","DOI":"10.21437\/Interspeech.2010-646"},{"key":"9874_CR212","doi-asserted-by":"crossref","unstructured":"Wong E, Sridharan S (2001). Comparison of linear prediction cepstrum coefficients and mel-frequency cepstrum coefficients for language identification. In: Proceedings of 2001 International Symposium on Intelligent Multimedia, Video and Speech Processing. ISIMP 2001 (IEEE Cat. No. 01EX489). IEEE, pp 95\u201398","DOI":"10.1109\/ISIMP.2001.925340"},{"key":"9874_CR213","doi-asserted-by":"crossref","unstructured":"Xie Y, Liang R, Liang Z, Zhao L (2019). Attention-Based Dense LSTM for Speech Emotion Recognition IEICE TRANSACTIONS on Information and Systems 102:1426\u20131429","DOI":"10.1587\/transinf.2019EDL8019"},{"key":"9874_CR214","unstructured":"Yadav KS, Mukhedkar M (2013). Review on speech recognition International Journal of Science and Engineering 1:61\u201370"},{"key":"9874_CR215","doi-asserted-by":"crossref","unstructured":"Yeh J-H, Pao T-L, Lin C-Y, Tsai Y-W, Chen Y-T (2011). Segment-based emotion recognition from continuous Mandarin Chinese speech Computers in Human Behavior 27:1545\u20131552","DOI":"10.1016\/j.chb.2010.10.027"},{"key":"9874_CR216","doi-asserted-by":"crossref","unstructured":"Yu Z et al. (2015). Using bidirectional lstm recurrent neural networks to learn high-level abstractions of sequential features for automated scoring of non-native spontaneous speech. In: 2015 IEEE workshop on automatic speech recognition and understanding (ASRU). IEEE, pp 338\u2013345","DOI":"10.1109\/ASRU.2015.7404814"},{"key":"9874_CR217","doi-asserted-by":"crossref","unstructured":"Zaidan NA, Salam MS MFCC (2016). Global Features Selection in Improving Speech Emotion Recognition Rate. In, Cham. Advances in Machine Learning and Signal Processing. Springer International Publishing, pp 141\u2013153","DOI":"10.1007\/978-3-319-32213-1_13"},{"key":"9874_CR218","doi-asserted-by":"crossref","first-page":"300","DOI":"10.1109\/TAFFC.2016.2553038","volume":"8","author":"S Zhalehpour","year":"2016","unstructured":"Zhalehpour S, Onder O, Akhtar Z, Erdem CE (2016) BAUM-1: A spontaneous audio-visual face database of affective and mental states. IEEE Trans Affect Comput 8:300\u2013313","journal-title":"IEEE Trans Affect Comput"},{"key":"9874_CR219","doi-asserted-by":"crossref","unstructured":"Zhang W, Meng X, Lu Q, Rao Y, Zhou J A (2013). hybrid emotion recognition on android smart phones. In: 2013 IEEE International Conference on Green Computing and Communications and IEEE Internet of Things and IEEE Cyber, Physical and Social Computing. IEEE, pp 1313\u20131318","DOI":"10.1109\/GreenCom-iThings-CPSCom.2013.228"},{"key":"9874_CR220","doi-asserted-by":"crossref","unstructured":"Zhang T, Wu J (2015). Speech emotion recognition with i-vector feature and RNN model. In: 2015 IEEE China Summit and International Conference on Signal and Information Processing (ChinaSIP). IEEE, pp 524\u2013528","DOI":"10.1109\/ChinaSIP.2015.7230458"},{"key":"9874_CR221","doi-asserted-by":"crossref","unstructured":"Zhang S, Zhang S, Huang T, Gao W (2017a). Speech emotion recognition using deep convolutional neural network and discriminant temporal pyramid matching IEEE Transactions on Multimedia 20:1576\u20131590","DOI":"10.1109\/TMM.2017.2766843"},{"key":"9874_CR222","doi-asserted-by":"crossref","unstructured":"Zhang W, Zhao D, Chai Z, Yang LT, Liu X, Gong F, Yang S (2017b). Deep learning and SVM-based emotion recognition from Chinese speech for smart affective services Software: Practice and Experience 47:1127\u20131138","DOI":"10.1002\/spe.2487"},{"key":"9874_CR223","doi-asserted-by":"crossref","unstructured":"Zhang W, Zhao D, Chen X, Zhang Y (2016c). Deep Learning Based Emotion Recognition from Chinese Speech. In, Cham. Inclusive Smart Cities and Digital Health. Springer International Publishing, pp 49\u201358","DOI":"10.1007\/978-3-319-39601-9_5"},{"key":"9874_CR224","doi-asserted-by":"crossref","unstructured":"Zhang S, Zhao X, Chuang Y, Guo W, Chen Y (2016a). Feature Learning via Deep Belief Network for Chinese Speech Emotion Recognition. In, Singapore. Pattern recognition. Springer Singapore, pp 645\u2013651","DOI":"10.1007\/978-981-10-3005-5_53"},{"key":"9874_CR225","series-title":"Communications in Computer and Information Science","doi-asserted-by":"publisher","first-page":"645","DOI":"10.1007\/978-981-10-3005-5_53","volume-title":"Pattern Recognition","author":"SQ Zhang","year":"2016","unstructured":"Zhang SQ, Zhao XM, Chuang YL, Guo WP, Chen Y (2016b) Feature learning via deep belief network for Chinese speech emotion recognition. In: Tan T, Li X, Chen X, Zhou J, Yang J, Cheng H (eds) Pattern Recognition, Communications in Computer and Information Science, vol 663. Springer-Verlag Singapore Pte Ltd, Singapore, pp 645\u2013651. https:\/\/doi.org\/10.1007\/978-981-10-3005-5_53"},{"key":"9874_CR226","doi-asserted-by":"crossref","unstructured":"Zhao Z, Bao Z, Zhao Y, Zhang Z, Cummins N, Ren Z, Schuller B (2019b). Exploring deep spectrum representations via attention-based recurrent and convolutional neural networks for speech emotion recognition IEEE Access 7:97515\u201397525","DOI":"10.1109\/ACCESS.2019.2928625"},{"key":"9874_CR227","doi-asserted-by":"crossref","unstructured":"Zhao J, Mao X, Chen L (2019a). Speech emotion recognition using deep 1D & 2D CNN LSTM networks biomedical signal processing and control 47:312-323","DOI":"10.1016\/j.bspc.2018.08.035"},{"key":"9874_CR228","doi-asserted-by":"crossref","unstructured":"Zheng W, Yu J, Zou Y (2015). An experimental study of speech emotion recognition based on deep convolutional neural networks. In: 2015 international conference on affective computing and intelligent interaction (ACII). IEEE, pp 827\u2013831","DOI":"10.1109\/ACII.2015.7344669"},{"key":"9874_CR229","doi-asserted-by":"crossref","unstructured":"Zhu L, Chen L, Zhao D, Zhou J, Zhang W (2017a). Emotion recognition from Chinese speech for smart affective services using a combination of SVM and DBN Sensors 17:1694","DOI":"10.3390\/s17071694"},{"key":"9874_CR230","doi-asserted-by":"publisher","unstructured":"Zhu LZ, Chen LM, Zhao DH, Zhou JH, Zhang WS (2017b). Emotion Recognition from Chinese Speech for Smart Affective Services Using a Combination of SVM and DBN Sensors 17:14. https:\/\/doi.org\/10.3390\/s17071694","DOI":"10.3390\/s17071694"},{"key":"9874_CR231","doi-asserted-by":"publisher","unstructured":"Zou CR, Zhang XR, Zha C, Zhao L (2016). A novel DBN feature fusion model for cross-Corpus speech emotion recognition journal of electrical and computer engineering:11 https:\/\/doi.org\/10.1155\/2016\/7437860","DOI":"10.1155\/2016\/7437860"},{"key":"9874_CR232","doi-asserted-by":"crossref","first-page":"358","DOI":"10.1631\/FITEE.1400323","volume":"16","author":"H Z-w","year":"2015","unstructured":"Z-w H, Xue W-t, Mao Q-R (2015) Speech emotion recognition with unsupervised feature learning. Frontiers of Information Technology & Electronic Engineering 16:358\u2013366","journal-title":"Frontiers of Information Technology & Electronic Engineering"},{"key":"9874_CR233","unstructured":"Lykartsis A, Weinzierl S (2016). Rhythm Description for Music and Speech Using the Beat Histogram with Multiple Novelty Functions: First Results"}],"updated-by":[{"DOI":"10.1007\/s11042-021-10967-0","type":"correction","label":"Correction","source":"publisher","updated":{"date-parts":[[2021,5,1]],"date-time":"2021-05-01T00:00:00Z","timestamp":1619827200000}}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09874-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-020-09874-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09874-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,16]],"date-time":"2023-10-16T17:43:05Z","timestamp":1697478185000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-020-09874-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,1,2]]},"references-count":233,"journal-issue":{"issue":"16","published-print":{"date-parts":[[2021,7]]}},"alternative-id":["9874"],"URL":"https:\/\/doi.org\/10.1007\/s11042-020-09874-7","relation":{"correction":[{"id-type":"doi","id":"10.1007\/s11042-021-10967-0","asserted-by":"object"}]},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,1,2]]},"assertion":[{"value":"25 February 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 July 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 September 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 January 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 May 2021","order":5,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Correction","order":6,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"A Correction to this paper has been published:","order":7,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"https:\/\/doi.org\/10.1007\/s11042-021-10967-0","URL":"https:\/\/doi.org\/10.1007\/s11042-021-10967-0","order":8,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}}]}}