{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T06:40:35Z","timestamp":1776753635945,"version":"3.51.2"},"reference-count":70,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2023,7,1]],"date-time":"2023-07-01T00:00:00Z","timestamp":1688169600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,7,1]],"date-time":"2023-07-01T00:00:00Z","timestamp":1688169600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Key Projects of the National Natural Science Foundation of China","award":["U1836220"],"award-info":[{"award-number":["U1836220"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176106"],"award-info":[{"award-number":["62176106"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013058","name":"Jiangsu Provincial Key Research and Development Program","doi-asserted-by":"publisher","award":["BE2020036"],"award-info":[{"award-number":["BE2020036"]}],"id":[{"id":"10.13039\/501100013058","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,7]]},"DOI":"10.1007\/s10772-023-10035-y","type":"journal-article","created":{"date-parts":[[2023,7,6]],"date-time":"2023-07-06T06:02:26Z","timestamp":1688623346000},"page":"541-557","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":29,"title":["An efficient speech emotion recognition based on a dual-stream CNN-transformer fusion network"],"prefix":"10.1007","volume":"26","author":[{"given":"Mohammed","family":"Tellai","sequence":"first","affiliation":[]},{"given":"Lijian","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Qirong","family":"Mao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,7,6]]},"reference":[{"key":"10035_CR1","first-page":"012036","volume":"930","author":"Y Afrillia","year":"2017","unstructured":"Afrillia, Y., Mawengkang, H., Ramli, M., Fhonna, R. P., et al. (2017). Performance measurement of mel frequency ceptral coefficient (MFCC) method in learning system of al-qur\u2019an based in nagham pattern recognition. Journal of Physics, 930, 012036.","journal-title":"Journal of Physics"},{"key":"10035_CR2","doi-asserted-by":"crossref","unstructured":"Aftab, A., Morsali, A., Ghaemmaghami, S., & Champagne, B. (2022). Light-sernet: A lightweight fully convolutional neural network for speech emotion recognition. In ICASSP 2022-2022 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 6912\u20136916). IEEE.","DOI":"10.1109\/ICASSP43922.2022.9746679"},{"issue":"2","key":"10035_CR3","doi-asserted-by":"crossref","first-page":"155","DOI":"10.1007\/s10462-012-9368-5","volume":"43","author":"C-N Anagnostopoulos","year":"2015","unstructured":"Anagnostopoulos, C.-N., Iliou, T., & Giannoukos, I. (2015). Features and classifiers for emotion recognition from speech: A survey from 2000 to 2011. Artificial Intelligence Review, 43(2), 155\u2013177.","journal-title":"Artificial Intelligence Review"},{"issue":"18","key":"10035_CR4","doi-asserted-by":"crossref","first-page":"5212","DOI":"10.3390\/s20185212","volume":"20","author":"T Anvarjon","year":"2020","unstructured":"Anvarjon, T., & Kwon, S. (2020). Deep-net: A lightweight CNN-based speech emotion recognition system using deep frequency features. Sensors, 20(18), 5212.","journal-title":"Sensors"},{"issue":"11","key":"10035_CR5","doi-asserted-by":"crossref","first-page":"21","DOI":"10.23915\/distill.00021","volume":"4","author":"A Araujo","year":"2019","unstructured":"Araujo, A., Norris, W., & Sim, J. (2019). Computing receptive fields of convolutional neural networks. Distill, 4(11), 21.","journal-title":"Distill"},{"issue":"4","key":"10035_CR6","first-page":"15","volume":"16","author":"G Assun\u00e7\u00e3o","year":"2020","unstructured":"Assun\u00e7\u00e3o, G., Menezes, P., & Perdig\u00e3o, F. (2020). Speaker awareness for speech emotion recognition. International Journal of Online and Biomedical Engineering, 16(4), 15\u201322.","journal-title":"International Journal of Online and Biomedical Engineering"},{"key":"10035_CR7","doi-asserted-by":"crossref","first-page":"108260","DOI":"10.1016\/j.apacoust.2021.108260","volume":"182","author":"O Atila","year":"2021","unstructured":"Atila, O., & \u015eeng\u00fcr, A. (2021). Attention guided 3d CNN-LSTM model for accurate speech based emotion recognition. Applied Acoustics, 182, 108260.","journal-title":"Applied Acoustics"},{"issue":"5","key":"10035_CR8","doi-asserted-by":"crossref","first-page":"5571","DOI":"10.1007\/s11042-017-5292-7","volume":"78","author":"AM Badshah","year":"2019","unstructured":"Badshah, A. M., Rahim, N., Ullah, N., Ahmad, J., Muhammad, K., Lee, M. Y., Kwon, S., & Baik, S. W. (2019). Deep features-based speech emotion recognition for smart affective services. Multimedia Tools and Applications, 78(5), 5571\u20135589.","journal-title":"Multimedia Tools and Applications"},{"key":"10035_CR9","doi-asserted-by":"crossref","first-page":"104886","DOI":"10.1016\/j.knosys.2019.104886","volume":"184","author":"A Bhavan","year":"2019","unstructured":"Bhavan, A., Chauhan, P., Shah, R. R., et al. (2019). Bagged support vector machines for emotion recognition from speech. Knowledge-Based Systems, 184, 104886.","journal-title":"Knowledge-Based Systems"},{"key":"10035_CR10","doi-asserted-by":"crossref","first-page":"103903","DOI":"10.1016\/j.engappai.2020.103903","volume":"95","author":"MC Bingol","year":"2020","unstructured":"Bingol, M. C., & Aydogmus, O. (2020). Performing predefined tasks using the human-robot interaction on speech recognition for an industrial robot. Engineering Applications of Artificial Intelligence, 95, 103903.","journal-title":"Engineering Applications of Artificial Intelligence"},{"key":"10035_CR11","doi-asserted-by":"crossref","first-page":"1517","DOI":"10.21437\/Interspeech.2005-446","volume":"5","author":"F Burkhardt","year":"2005","unstructured":"Burkhardt, F., Paeschke, A., Rolfes, M., Sendlmeier, W. F., Weiss, B., et al. (2005). A database of German emotional speech. Interspeech, 5, 1517\u20131520.","journal-title":"Interspeech"},{"issue":"10","key":"10035_CR12","doi-asserted-by":"crossref","first-page":"1440","DOI":"10.1109\/LSP.2018.2860246","volume":"25","author":"M Chen","year":"2018","unstructured":"Chen, M., He, X., Yang, J., & Zhang, H. (2018). 3-d convolutional recurrent neural networks with attention model for speech emotion recognition. IEEE Signal Processing Letters, 25(10), 1440\u20131444.","journal-title":"IEEE Signal Processing Letters"},{"key":"10035_CR13","first-page":"012003","volume":"2236","author":"RR Choudhary","year":"2022","unstructured":"Choudhary, R. R., Meena, G., & Mohbey, K. K. (2022). Speech emotion based sentiment recognition using deep neural networks. Journal of Physics, 2236, 012003.","journal-title":"Journal of Physics"},{"key":"10035_CR14","unstructured":"Ciresan, D. C., Meier, U., Masci, J., Gambardella, L. M., & Schmidhuber, J. (2011). Flexible, high performance convolutional neural networks for image classification. In Proceedings of the twenty-second international joint conference on artificial intelligence (IJCAI). AAAI Press."},{"key":"10035_CR15","doi-asserted-by":"crossref","first-page":"107360","DOI":"10.1016\/j.apacoust.2020.107360","volume":"166","author":"F Daneshfar","year":"2020","unstructured":"Daneshfar, F., Kabudian, S. J., & Neekabadi, A. (2020). Speech emotion recognition using hybrid spectral-prosodic features of speech signal\/glottal waveform, metaheuristic-based dimensionality reduction, and gaussian elliptical basis function network classifier. Applied Acoustics, 166, 107360.","journal-title":"Applied Acoustics"},{"key":"10035_CR16","unstructured":"Dupuis, K., & Pichora-Fuller, M. K. (2010). Toronto emotional speech set (TESS)-younger talker_happy."},{"issue":"3","key":"10035_CR17","first-page":"182","volume":"39","author":"K Dupuis","year":"2011","unstructured":"Dupuis, K., & Pichora-Fuller, M. K. (2011). Recognition of emotional speech for younger and older talkers: Behavioural findings from the Toronto emotional speech set. Canadian Acoustics, 39(3), 182\u2013183.","journal-title":"Canadian Acoustics"},{"issue":"3","key":"10035_CR18","doi-asserted-by":"crossref","first-page":"572","DOI":"10.1016\/j.patcog.2010.09.020","volume":"44","author":"M El Ayadi","year":"2011","unstructured":"El Ayadi, M., Kamel, M. S., & Karray, F. (2011). Survey on speech emotion recognition: Features, classification schemes, and databases. Pattern Recognition, 44(3), 572\u2013587.","journal-title":"Pattern Recognition"},{"issue":"1","key":"10035_CR19","doi-asserted-by":"crossref","first-page":"155","DOI":"10.1007\/s10772-020-09776-x","volume":"24","author":"M Gomathy","year":"2021","unstructured":"Gomathy, M. (2021). Optimal feature selection for speech emotion recognition using enhanced cat swarm optimization algorithm. International Journal of Speech Technology, 24(1), 155\u2013163.","journal-title":"International Journal of Speech Technology"},{"key":"10035_CR20","doi-asserted-by":"crossref","unstructured":"Gong, Y., Chung, Y.-A., & Glass, J. (2021). Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778.","DOI":"10.21437\/Interspeech.2021-698"},{"key":"10035_CR21","doi-asserted-by":"crossref","unstructured":"Gumelar, A. B., Yuniarno, E. M., Adi, D. P., Setiawan, R., Sugiarto, I., & Purnomo, M. H. (2022). Transformer-CNN automatic hyperparameter tuning for speech emotion recognition. In 2022 IEEE international conference on imaging systems and techniques (IST) (pp. 1\u20136). IEEE.","DOI":"10.1109\/IST55454.2022.9827732"},{"key":"10035_CR22","doi-asserted-by":"crossref","first-page":"75798","DOI":"10.1109\/ACCESS.2019.2921390","volume":"7","author":"L Guo","year":"2019","unstructured":"Guo, L., Wang, L., Dang, J., Liu, Z., & Guan, H. (2019). Exploration of complementary features for speech emotion recognition based on kernel extreme learning machine. IEEE Access, 7, 75798\u201375809.","journal-title":"IEEE Access"},{"key":"10035_CR24","doi-asserted-by":"crossref","unstructured":"Han, K., Yu, D., & Tashev, I. (2014). Speech emotion recognition using deep neural network and extreme learning machine. In Interspeech.","DOI":"10.21437\/Interspeech.2014-57"},{"key":"10035_CR23","doi-asserted-by":"crossref","unstructured":"Han, S., Leng, F., & Jin, Z. (2021). Speech emotion recognition with a resnet-CNN-transformer parallel neural network. In 2021 International conference on communications, information system and computer engineering (CISCE) (pp. 803\u2013807). IEEE.","DOI":"10.1109\/CISCE52179.2021.9445906"},{"key":"10035_CR25","unstructured":"Huang, A., & Bao, P. (2019). Human vocal sentiment analysis. arXiv preprint arXiv:1905.08632."},{"key":"10035_CR26","doi-asserted-by":"crossref","unstructured":"Huang, Z., Dong, M., Mao, Q., & Zhan, Y. (2014). Speech emotion recognition using CNN. In Proceedings of the 22nd ACM international conference on multimedia (pp. 801\u2013804).","DOI":"10.1145\/2647868.2654984"},{"issue":"5","key":"10035_CR27","doi-asserted-by":"crossref","first-page":"358","DOI":"10.1631\/FITEE.1400323","volume":"16","author":"Z-W Huang","year":"2015","unstructured":"Huang, Z.-W., Xue, W.-T., & Mao, Q.-R. (2015). Speech emotion recognition with unsupervised feature learning. Frontiers of Information Technology & Electronic Engineering, 16(5), 358\u2013366.","journal-title":"Frontiers of Information Technology & Electronic Engineering"},{"issue":"4","key":"10035_CR28","first-page":"275","volume":"27","author":"A Ismail","year":"2014","unstructured":"Ismail, A., Idris, M. Y. I., Noor, N. M., Razak, Z., & Yusoff, Z. M. (2014). MFCC-VQ approach for qalqalahtajweed rule checking. Malaysian Journal of Computer Science, 27(4), 275\u2013293.","journal-title":"Malaysian Journal of Computer Science"},{"key":"10035_CR29","doi-asserted-by":"crossref","first-page":"101894","DOI":"10.1016\/j.bspc.2020.101894","volume":"59","author":"D Issa","year":"2020","unstructured":"Issa, D., Demirci, M. F., & Yazici, A. (2020). Speech emotion recognition with deep convolutional neural networks. Biomedical Signal Processing and Control, 59, 101894.","journal-title":"Biomedical Signal Processing and Control"},{"key":"10035_CR30","doi-asserted-by":"crossref","unstructured":"Jalal, M. A., Loweimi, E., Moore, R. K., & Hain, T. (2019). Learning temporal clusters using capsule routing for speech emotion recognition. In Proceedings of interspeech 2019 (pp. 1701\u20131705). ISCA.","DOI":"10.21437\/Interspeech.2019-3068"},{"key":"10035_CR31","first-page":"68","volume":"67","author":"CA Jason","year":"2020","unstructured":"Jason, C. A., Kumar, S., et al. (2020). An appraisal on speech and emotion recognition technologies based on machine learning. Language, 67, 68.","journal-title":"Language"},{"key":"10035_CR32","doi-asserted-by":"crossref","first-page":"90368","DOI":"10.1109\/ACCESS.2019.2927384","volume":"7","author":"P Jiang","year":"2019","unstructured":"Jiang, P., Fu, H., Tao, H., Lei, P., & Zhao, L. (2019). Parallelized convolutional recurrent neural network with spectral features for speech emotion recognition. IEEE Access, 7, 90368\u201390377.","journal-title":"IEEE Access"},{"key":"10035_CR33","doi-asserted-by":"crossref","first-page":"67718","DOI":"10.1109\/ACCESS.2019.2916828","volume":"7","author":"F Karim","year":"2019","unstructured":"Karim, F., Majumdar, S., & Darabi, H. (2019). Insights into LSTM fully convolutional networks for time series classification. IEEE Access, 7, 67718\u201367725.","journal-title":"IEEE Access"},{"issue":"2","key":"10035_CR34","doi-asserted-by":"crossref","first-page":"303","DOI":"10.1007\/s10772-020-09792-x","volume":"24","author":"U Kumaran","year":"2021","unstructured":"Kumaran, U., Radha Rammohan, S., Nagarajan, S. M., & Prathik, A. (2021). Fusion of mel and gammatone frequency cepstral coefficients for speech emotion recognition using deep c-rnn. International Journal of Speech Technology, 24(2), 303\u2013314.","journal-title":"International Journal of Speech Technology"},{"issue":"1","key":"10035_CR35","doi-asserted-by":"crossref","first-page":"183","DOI":"10.3390\/s20010183","volume":"20","author":"S Kwon","year":"2019","unstructured":"Kwon, S. (2019). A CNN-assisted enhanced audio signal processing for speech emotion recognition. Sensors, 20(1), 183.","journal-title":"Sensors"},{"issue":"9","key":"10035_CR37","doi-asserted-by":"crossref","first-page":"5116","DOI":"10.1002\/int.22505","volume":"36","author":"S Kwon","year":"2021","unstructured":"Kwon, S. (2021). Optimal feature selection based speech emotion recognition using two-stream deep convolutional neural network. International Journal of Intelligent Systems, 36(9), 5116\u20135135.","journal-title":"International Journal of Intelligent Systems"},{"key":"10035_CR36","doi-asserted-by":"crossref","first-page":"114177","DOI":"10.1016\/j.eswa.2020.114177","volume":"167","author":"S Kwon","year":"2021","unstructured":"Kwon, S. (2021). MLT-Dnet: Speech emotion recognition using 1d dilated CNN based on multi-learning trick approach. Expert Systems with Applications, 167, 114177.","journal-title":"Expert Systems with Applications"},{"issue":"7553","key":"10035_CR38","doi-asserted-by":"crossref","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun, Y., Bengio, Y., & Hinton, G. (2015). Deep learning. Nature, 521(7553), 436\u2013444.","journal-title":"Nature"},{"key":"10035_CR39","doi-asserted-by":"crossref","unstructured":"Lee, J., & Tashev, I. (2015). High-level feature representation using recurrent neural network for speech emotion recognition. In Interspeech.","DOI":"10.21437\/Interspeech.2015-336"},{"key":"10035_CR40","doi-asserted-by":"crossref","unstructured":"Li, Y., Zhao, T., & Kawahara, T. (2019). Improved end-to-end speech emotion recognition using self attention mechanism and multitask learning. In Interspeech (pp. 2803\u20132807).","DOI":"10.21437\/Interspeech.2019-2594"},{"issue":"2","key":"10035_CR41","doi-asserted-by":"crossref","first-page":"391","DOI":"10.1007\/s10772-021-09955-4","volume":"25","author":"M Liu","year":"2022","unstructured":"Liu, M. (2022). English speech emotion recognition method based on speech recognition. International Journal of Speech Technology, 25(2), 391\u2013398.","journal-title":"International Journal of Speech Technology"},{"issue":"5","key":"10035_CR42","doi-asserted-by":"crossref","first-page":"0196391","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone, S. R., & Russo, F. A. (2018). The Ryerson audio-visual database of emotional speech and song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English. PLoS ONE, 13(5), 0196391.","journal-title":"PLoS ONE"},{"key":"10035_CR43","doi-asserted-by":"crossref","first-page":"238","DOI":"10.1016\/j.neucom.2021.02.094","volume":"448","author":"S Li","year":"2021","unstructured":"Li, S., Xing, X., Fan, W., Cai, B., Fordson, P., & Xu, X. (2021). Spatiotemporal and frequential cascaded attention networks for speech emotion recognition. Neurocomputing, 448, 238\u2013248.","journal-title":"Neurocomputing"},{"key":"10035_CR44","unstructured":"Luo, W., Li, Y., Urtasun, R., & Zemel, R. (2016). Understanding the effective receptive field in deep convolutional neural networks. Advances in Neural Information Processing Systems, 29."},{"issue":"8","key":"10035_CR45","doi-asserted-by":"crossref","first-page":"2203","DOI":"10.1109\/TMM.2014.2360798","volume":"16","author":"Q Mao","year":"2014","unstructured":"Mao, Q., Dong, M., Huang, Z., & Zhan, Y. (2014). Learning salient features for speech emotion recognition using convolutional neural networks. IEEE Transactions on Multimedia, 16(8), 2203\u20132213.","journal-title":"IEEE Transactions on Multimedia"},{"key":"10035_CR46","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.specom.2017.06.006","volume":"93","author":"Q Mao","year":"2017","unstructured":"Mao, Q., Xu, G., Xue, W., Gou, J., & Zhan, Y. (2017). Learning emotion-discriminative and domain-invariant features for domain adaptation in speech emotion recognition. Speech Communication, 93, 1\u201310.","journal-title":"Speech Communication"},{"key":"10035_CR47","doi-asserted-by":"crossref","unstructured":"McFee, B., Raffel, C., Liang, D., Ellis, D. P., McVicar, M., Battenberg, E., & Nieto, O. (2015). Librosa: Audio and music signal analysis in Python. In Proceedings of the 14th Python in science conference (Vol. 8, pp. 18\u201325). Citeseer.","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"10035_CR48","doi-asserted-by":"crossref","first-page":"125868","DOI":"10.1109\/ACCESS.2019.2938007","volume":"7","author":"H Meng","year":"2019","unstructured":"Meng, H., Yan, T., Yuan, F., & Wei, H. (2019). Speech emotion recognition from 3D Log-Mel spectrograms with deep learning network. IEEE Access, 7, 125868\u2013125881.","journal-title":"IEEE Access"},{"key":"10035_CR49","doi-asserted-by":"crossref","unstructured":"Mirsamadi, S., Barsoum, E., & Zhang, C. (2017). Automatic speech emotion recognition using recurrent neural networks with local attention. In 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 2227\u20132231). IEEE.","DOI":"10.1109\/ICASSP.2017.7952552"},{"issue":"2","key":"10035_CR50","doi-asserted-by":"crossref","first-page":"98","DOI":"10.1016\/j.specom.2006.11.004","volume":"49","author":"D Morrison","year":"2007","unstructured":"Morrison, D., Wang, R., & De Silva, L. C. (2007). Ensemble methods for spoken emotion recognition in call-centres. Speech Communication, 49(2), 98\u2013112.","journal-title":"Speech Communication"},{"issue":"3","key":"10035_CR51","doi-asserted-by":"crossref","first-page":"4039","DOI":"10.32604\/cmc.2021.015070","volume":"67","author":"KS Mustaqeem","year":"2021","unstructured":"Mustaqeem, K. S. (2021). 1d-CNN: Speech emotion recognition system using a stacked network with dilated CNN features. CMC-Computers Materials & Continua, 67(3), 4039\u20134059.","journal-title":"CMC-Computers Materials & Continua"},{"issue":"3","key":"10035_CR52","doi-asserted-by":"crossref","first-page":"587","DOI":"10.3390\/rs12030587","volume":"12","author":"RA Naqvi","year":"2020","unstructured":"Naqvi, R. A., Arsalan, M., Rehman, A., Rehman, A. U., Loh, W.-K., & Paul, A. (2020). Deep learning-based drivers emotion classification system in time series data for remote applications. Remote Sensing, 12(3), 587.","journal-title":"Remote Sensing"},{"key":"10035_CR53","first-page":"8026","volume":"32","author":"A Paszke","year":"2019","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al. (2019). Pytorch: An imperative style, high-performance deep learning library. Advances in Neural Information Processing Systems, 32, 8026\u20138037.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"11","key":"10035_CR54","doi-asserted-by":"crossref","first-page":"1577","DOI":"10.3844\/jcssp.2018.1577.1587","volume":"14","author":"V Praseetha","year":"2018","unstructured":"Praseetha, V., & Vadivel, S. (2018). Deep learning models for speech emotion recognition. Journal of Computer Science, 14(11), 1577\u20131587.","journal-title":"Journal of Computer Science"},{"key":"10035_CR55","doi-asserted-by":"crossref","unstructured":"Rahaman, M. E., Alam, S. S., Mondal, H. S., Muntaseer, A.S., Mandal, R., & Raihan, M. (2019). Performance analysis of isolated speech recognition technique using MFCC and cross-correlation. In 2019 10th international conference on computing, communication and networking technologies (ICCCNT) (pp. 1\u20134). IEEE.","DOI":"10.1109\/ICCCNT45670.2019.8944534"},{"key":"10035_CR56","doi-asserted-by":"crossref","first-page":"79861","DOI":"10.1109\/ACCESS.2020.2990405","volume":"8","author":"M Sajjad","year":"2020","unstructured":"Sajjad, M., Kwon, S., et al. (2020). Clustering-based speech emotion recognition by incorporating learned features and deep BiLSTM. IEEE Access, 8, 79861\u201379875.","journal-title":"IEEE Access"},{"key":"10035_CR57","doi-asserted-by":"crossref","unstructured":"Satt, A., Rozenberg, S., & Hoory, R. (2017). Efficient emotion recognition from speech using deep learning on spectrograms. In Interspeech (pp. 1089\u20131093).","DOI":"10.21437\/Interspeech.2017-200"},{"issue":"2","key":"10035_CR58","doi-asserted-by":"crossref","first-page":"119","DOI":"10.1109\/T-AFFC.2010.8","volume":"1","author":"B Schuller","year":"2010","unstructured":"Schuller, B., Vlasenko, B., Eyben, F., W\u00f6llmer, M., Stuhlsatz, A., Wendemuth, A., & Rigoll, G. (2010). Cross-corpus acoustic emotion recognition: Variances and strategies. IEEE Transactions on Affective Computing, 1(2), 119\u2013131.","journal-title":"IEEE Transactions on Affective Computing"},{"key":"10035_CR59","doi-asserted-by":"crossref","unstructured":"Singh, Y. B., & Goel, S. (2022). A systematic literature review of speech emotion recognition approaches. Neurocomputing.","DOI":"10.1016\/j.neucom.2022.04.028"},{"key":"10035_CR60","doi-asserted-by":"crossref","unstructured":"Stuhlsatz, A., Meyer, C., Eyben, F., Zielke, T., Meier, G., & Schuller, B. (2011). Deep neural networks for acoustic emotion recognition: Raising the benchmarks. In 2011 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5688\u20135691). IEEE","DOI":"10.1109\/ICASSP.2011.5947651"},{"key":"10035_CR61","doi-asserted-by":"crossref","unstructured":"Tao, F., & Liu, G. (2018). Advanced LSTM: A study about better time dependency modeling in emotion recognition. In 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 2906\u20132910). IEEE.","DOI":"10.1109\/ICASSP.2018.8461750"},{"key":"10035_CR62","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. Advances in Neural Information Processing Systems, 30."},{"key":"10035_CR63","doi-asserted-by":"crossref","first-page":"55","DOI":"10.1016\/j.patcog.2018.12.026","volume":"89","author":"H Wang","year":"2019","unstructured":"Wang, H., Zhang, Q., Wu, J., Pan, S., & Chen, Y. (2019). Time series feature learning with labeled and unlabeled data. Pattern Recognition, 89, 55\u201366.","journal-title":"Pattern Recognition"},{"issue":"7","key":"10035_CR64","doi-asserted-by":"crossref","first-page":"1436","DOI":"10.1109\/TASLP.2017.2694704","volume":"25","author":"X Xu","year":"2017","unstructured":"Xu, X., Deng, J., Cummins, N., Zhang, Z., Wu, C., Zhao, L., & Schuller, B. (2017). A two-dimensional framework of multiple kernel subspace learning for recognizing emotion in speech. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 25(7), 1436\u20131449.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10035_CR65","doi-asserted-by":"crossref","unstructured":"Zamil, A. A. A., Hasan, S., Baki, S. M. J., Adam, J. M., & Zaman, I. (2019). Emotion detection from speech signals using voting mechanism on classified frames. In 2019 International conference on robotics, electrical and signal processing techniques (ICREST) (pp. 281\u2013285). IEEE.","DOI":"10.1109\/ICREST.2019.8644168"},{"key":"10035_CR66","doi-asserted-by":"crossref","unstructured":"Zayene, B., Jlassi, C., & Arous, N. (2020). 3D convolutional recurrent global neural network for speech emotion recognition. In 2020 5th International conference on advanced technologies for signal and image processing (ATSIP) (pp. 1\u20135). IEEE.","DOI":"10.1109\/ATSIP49331.2020.9231597"},{"key":"10035_CR68","doi-asserted-by":"crossref","first-page":"10767","DOI":"10.1109\/ACCESS.2019.2891838","volume":"7","author":"M Zeng","year":"2019","unstructured":"Zeng, M., & Xiao, N. (2019). Effective combination of densenet and BiLSTM for keyword spotting. IEEE Access, 7, 10767\u201310775.","journal-title":"IEEE Access"},{"issue":"3","key":"10035_CR67","doi-asserted-by":"crossref","first-page":"3705","DOI":"10.1007\/s11042-017-5539-3","volume":"78","author":"Y Zeng","year":"2019","unstructured":"Zeng, Y., Mao, H., Peng, D., & Yi, Z. (2019). Spectrogram based multi-task audio classification. Multimedia Tools and Applications, 78(3), 3705\u20133722.","journal-title":"Multimedia Tools and Applications"},{"issue":"6","key":"10035_CR69","doi-asserted-by":"crossref","first-page":"1576","DOI":"10.1109\/TMM.2017.2766843","volume":"20","author":"S Zhang","year":"2017","unstructured":"Zhang, S., Zhang, S., Huang, T., & Gao, W. (2017). Speech emotion recognition using deep convolutional neural network and discriminant temporal pyramid matching. IEEE Transactions on Multimedia, 20(6), 1576\u20131590.","journal-title":"IEEE Transactions on Multimedia"},{"key":"10035_CR70","doi-asserted-by":"crossref","first-page":"312","DOI":"10.1016\/j.bspc.2018.08.035","volume":"47","author":"J Zhao","year":"2019","unstructured":"Zhao, J., Mao, X., & Chen, L. (2019). Speech emotion recognition using deep 1d & 2d CNN LSTM networks. Biomedical Signal Processing and Control, 47, 312\u2013323.","journal-title":"Biomedical Signal Processing and Control"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10035-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-023-10035-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10035-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,31]],"date-time":"2023-07-31T11:18:12Z","timestamp":1690802292000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-023-10035-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7]]},"references-count":70,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2023,7]]}},"alternative-id":["10035"],"URL":"https:\/\/doi.org\/10.1007\/s10772-023-10035-y","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"value":"1381-2416","type":"print"},{"value":"1572-8110","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,7]]},"assertion":[{"value":"12 December 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 May 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 July 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}