{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T16:24:08Z","timestamp":1757780648068,"version":"3.37.3"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s10772-023-10079-0","type":"journal-article","created":{"date-parts":[[2023,12,28]],"date-time":"2023-12-28T12:02:26Z","timestamp":1703764946000},"page":"1165-1181","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Deep neural network architectures for audio emotion recognition performed on song and speech modalities"],"prefix":"10.1007","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6389-8933","authenticated-orcid":false,"given":"Souha","family":"Ayadi","sequence":"first","affiliation":[]},{"given":"Zied","family":"Lachiri","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,28]]},"reference":[{"key":"10079_CR1","unstructured":"Agarap, A. F. (2018). Deep learning using rectified linear units (relu). CoRR, arXiv:abs\/1803.08375."},{"issue":"2","key":"10079_CR2","first-page":"285","volume":"33","author":"K Aghajani","year":"2020","unstructured":"Aghajani, K., Esmaili, I., & Afrakoti, P. (2020). Speech emotion recognition using scalogram based deep structure. International Journal of Engineering, 33(2), 285\u2013292.","journal-title":"International Journal of Engineering"},{"issue":"3","key":"10079_CR3","doi-asserted-by":"publisher","first-page":"469","DOI":"10.1080\/00401706.1971.10488811","volume":"13","author":"DM Allen","year":"1971","unstructured":"Allen, D. M. (1971). Mean square error of prediction as a criterion for selecting variables. Technometrics, 13(3), 469\u2013475.","journal-title":"Technometrics"},{"key":"10079_CR4","unstructured":"Anand, N. & Verma, P. (2015). Convoluted feelings convolutional and recurrent nets for detecting emotion from audio data. In Technical Report. Stanford University."},{"issue":"18","key":"10079_CR5","doi-asserted-by":"publisher","first-page":"5212","DOI":"10.3390\/s20185212","volume":"20","author":"T Anvarjon","year":"2020","unstructured":"Anvarjon, T., Kwon, S., et al. (2020). Deep-net: A lightweight cnn-based speech emotion recognition system using deep frequency features. Sensors, 20(18), 5212.","journal-title":"Sensors"},{"key":"10079_CR6","doi-asserted-by":"crossref","unstructured":"Ayadi, S., & Lachiri, Z. (2022a). A combined cnn-lstm network for audio emotion recognition using speech and song attributs. In 2022 6th International conference on advanced technologies for signal and image processing (ATSIP) (pp. 1\u20136). IEEE.","DOI":"10.1109\/ATSIP55956.2022.9805924"},{"key":"10079_CR7","doi-asserted-by":"crossref","unstructured":"Ayadi, S., & Lachiri, Z. (2022b). 
Deep neural network for visual emotion recognition based on resnet50 using song-speech characteristics. In 2022 5th international conference on advanced systems and emergent technologies (IC_ASET) (pp. 363\u2013368). IEEE.","DOI":"10.1109\/IC_ASET53395.2022.9765898"},{"key":"10079_CR8","doi-asserted-by":"crossref","unstructured":"Ayadi, S., & Lachiri, Z. (2022c). Multiple neural network architectures for visual emotion recognition using song-speech modality. In 2022 IEEE information technologies & smart industrial systems (ITSIS) (pp. 1\u20136). IEEE.","DOI":"10.1109\/ITSIS56166.2022.10118423"},{"key":"10079_CR9","first-page":"98","volume":"3","author":"S Ayadi","year":"2022","unstructured":"Ayadi, S., & Lachiri, Z. (2022d). Visual emotion sensing using convolutional neural network. Przeglad Elektrotechniczny, 3, 98.","journal-title":"Przeglad Elektrotechniczny"},{"key":"10079_CR10","doi-asserted-by":"crossref","unstructured":"Cho, J., Pappagari, R., Kulkarni, P., Villalba, J., Carmiel, Y., & Dehak, N. (2019). Deep neural networks for emotion recognition combining audio and transcripts. arXiv preprint arXiv:1911.00432.","DOI":"10.21437\/Interspeech.2018-2466"},{"issue":"21","key":"10079_CR11","doi-asserted-by":"publisher","first-page":"6008","DOI":"10.3390\/s20216008","volume":"20","author":"M Farooq","year":"2020","unstructured":"Farooq, M., Hussain, F., Baloch, N. K., Raja, F. R., Heejung, Y., & Zikria, Yousaf Bin. (2020). Impact of feature selection algorithm on speech emotion recognition using deep convolutional neural network. Sensors, 20(21), 6008.","journal-title":"Sensors"},{"key":"10079_CR12","doi-asserted-by":"crossref","unstructured":"Garg, U., Agarwal, S., Gupta, S., Dutt, R., & Singh, D. (2020). Prediction of emotions from the audio speech signals using mfcc, mel and chroma. In 2020 12th international conference on computational intelligence and communication networks (CICN) (pp. 87\u201391). IEEE.","DOI":"10.1109\/CICN49253.2020.9242635"},{"issue":"5","key":"10079_CR13","doi-asserted-by":"publisher","first-page":"855","DOI":"10.1109\/TPAMI.2008.137","volume":"31","author":"A Graves","year":"2008","unstructured":"Graves, A., Liwicki, M., Fernandez, S., Bertolami, R., Bunke, H., & Schmidhuber, J. (2008). A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, 31(5), 855\u2013868.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10079_CR14","doi-asserted-by":"crossref","unstructured":"Han, K., Yu, D., & Tashev, I. (2014). Speech emotion recognition using deep neural network and extreme learning machine. In Interspeech 2014.","DOI":"10.21437\/Interspeech.2014-57"},{"key":"10079_CR15","doi-asserted-by":"crossref","unstructured":"He, G., Liu, X., Fan, F., & You, J. (2020). Image2audio: Facilitating semi-supervised audio emotion recognition with facial expression image. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops. (pp. 912\u2013913).","DOI":"10.1109\/CVPRW50498.2020.00464"},{"issue":"8","key":"10079_CR16","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. 
Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"issue":"3","key":"10079_CR17","doi-asserted-by":"publisher","first-page":"1323","DOI":"10.1007\/s10044-019-00860-w","volume":"23","author":"S Hwang","year":"2020","unstructured":"Hwang, S., Hong, K., Son, G., & Byun, H. (2020). Learning cnn features from de features for eeg-based emotion recognition. Pattern Analysis and Applications, 23(3), 1323\u20131335.","journal-title":"Pattern Analysis and Applications"},{"key":"10079_CR18","unstructured":"Ioffe, S., & Szegedy, C. (2015). Batch normalization: Accelerating deep network training by reducing internal covariate shift. CoRR, arXiv:abs\/1502.03167."},{"key":"10079_CR19","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2020.101894","volume":"59","author":"D Issa","year":"2020","unstructured":"Issa, D., Demirci, M. F., & Yazici, A. (2020). Speech emotion recognition with deep convolutional neural networks. Biomedical Signal Processing and Control, 59, 101894.","journal-title":"Biomedical Signal Processing and Control"},{"key":"10079_CR20","doi-asserted-by":"crossref","unstructured":"Jacovi, A., Shalom, O. S., & Goldberg, Y. (2018). Understanding convolutional neural networks for text classification. CoRR, arXiv:abs\/1809.08037.","DOI":"10.18653\/v1\/W18-5408"},{"issue":"2","key":"10079_CR21","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s10772-020-09792-x","volume":"24","author":"U Kumaran","year":"2021","unstructured":"Kumaran, U., Radha Rammohan, S., Murugan Nagarajan, S., & Prathik, A. (2021). Fusion of mel and gammatone frequency cepstral coefficients for speech emotion recognition using deep c-rnn. International Journal of Speech Technology, 24(2), 303\u2013314.","journal-title":"International Journal of Speech Technology"},{"issue":"12","key":"10079_CR22","doi-asserted-by":"publisher","first-page":"2133","DOI":"10.3390\/math8122133","volume":"8","author":"S Kwon","year":"2020","unstructured":"Kwon, S., et al. (2020). Clstm: Deep feature-based speech emotion recognition using the hierarchical convlstm network. Mathematics, 8(12), 2133.","journal-title":"Mathematics"},{"issue":"1","key":"10079_CR23","first-page":"183","volume":"20","author":"S Kwon","year":"2020","unstructured":"Kwon, S., et al. (2020). A cnn-assisted enhanced audio signal processing for speech emotion recognition. Sensors, 20(1), 183.","journal-title":"Sensors"},{"key":"10079_CR24","doi-asserted-by":"crossref","unstructured":"Lee, J., & Tashev, I. (2015). High-level feature representation using recurrent neural network for speech emotion recognition. In Sixteenth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2015-336"},{"key":"10079_CR25","unstructured":"Lee, J., Kim, T., Park, J., & Nam, J. (2017). Raw waveform-based audio classification using sample-level cnn architectures. arXiv preprint arXiv:1712.00866."},{"key":"10079_CR26","unstructured":"Li, T., & Ogihara, M. (2003). Detecting emotion in music."},{"key":"10079_CR27","doi-asserted-by":"crossref","unstructured":"Li, X., & Wu, X. (2014). Constructing long short-term memory based deep recurrent neural networks for large vocabulary speech recognition. CoRR, arXiv:abs\/1410.4281.","DOI":"10.1109\/ICASSP.2015.7178826"},{"key":"10079_CR28","doi-asserted-by":"crossref","unstructured":"Li, X., & Wu, X. (2015). 
Constructing long short-term memory based deep recurrent neural networks for large vocabulary speech recognition.","DOI":"10.1109\/ICASSP.2015.7178826"},{"issue":"6","key":"10079_CR29","doi-asserted-by":"publisher","first-page":"913","DOI":"10.1007\/s12652-016-0406-z","volume":"8","author":"Y Li","year":"2017","unstructured":"Li, Y., Tao, J., Chao, L., Bao, W., & Liu, Y. (2017). Cheavd: A chinese natural emotional audio-visual database. Journal of Ambient Intelligence and Humanized Computing, 8(6), 913\u2013924.","journal-title":"Journal of Ambient Intelligence and Humanized Computing"},{"key":"10079_CR30","unstructured":"Lipton, Z., Kale, D., Elkan, C., & Wetzel, R. (2015). Learning to diagnose with lstm recurrent neural networks. 11."},{"key":"10079_CR31","doi-asserted-by":"crossref","unstructured":"Liu, T., Han, L., Ma, L., & Guo, D.(2018). Audio-based deep music emotion recognition. In AIP conference proceedings (Vol. 1967, pp. 040021). AIP Publishing LLC.","DOI":"10.1063\/1.5039095"},{"key":"10079_CR32","unstructured":"Liu, X., Chen, Q., Wu, X., Liu, Y., & Liu, Y.. (2017). Cnn based music emotion classification. arXiv preprint arXiv:1704.05665."},{"key":"10079_CR33","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone, S. R., & Russo, F. A. (2018). The Ryerson audio-visual database of emotional speech and song (Ravdess): A dynamic, multimodal set of facial and vocal expressions in North American English. PLoS ONE, 13, 1\u201335.","journal-title":"PLoS ONE"},{"key":"10079_CR34","unstructured":"Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion, B., Grisel, O., Blondel, M., Prettenhofer, P., Weiss, R., Dubourg, V., VanderPlas, J., Passos, A., Cournapeau, D., Brucher, M., Perrot, M., & Duchesnay, E. (2012). Scikit-learn: Machine learning in python. CoRR, abs\/1201.0490."},{"issue":"9","key":"10079_CR35","doi-asserted-by":"publisher","first-page":"1215","DOI":"10.1109\/5.237532","volume":"81","author":"JW Picone","year":"1993","unstructured":"Picone, J. W. (1993). Signal modeling techniques in speech recognition. Proceedings of the IEEE, 81(9), 1215\u20131247.","journal-title":"Proceedings of the IEEE"},{"issue":"4","key":"10079_CR36","doi-asserted-by":"publisher","first-page":"543","DOI":"10.1016\/j.specom.2011.11.004","volume":"54","author":"M Sahidullah","year":"2012","unstructured":"Sahidullah, M., & Saha, G. (2012). Design, analysis and experimental evaluation of block based transformation in mfcc computation for speaker recognition. Speech Communication, 54(4), 543\u2013565.","journal-title":"Speech Communication"},{"issue":"19","key":"10079_CR37","doi-asserted-by":"publisher","first-page":"5559","DOI":"10.3390\/s20195559","volume":"20","author":"M Seo","year":"2020","unstructured":"Seo, M., & Kim, M. (2020). Fusing visual attention cnn and bag of visual words for cross-corpus speech emotion recognition. Sensors, 20(19), 5559.","journal-title":"Sensors"},{"key":"10079_CR38","unstructured":"Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556"},{"issue":"56","key":"10079_CR39","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., & Salakhutdinov, R. (2014). Dropout: A simple way to prevent neural networks from overfitting. 
Journal of Machine Learning Research, 15(56), 1929\u20131958.","journal-title":"Journal of Machine Learning Research"},{"key":"10079_CR40","doi-asserted-by":"crossref","unstructured":"Wu, B., Zhong, E., Horner, A., & Yang, Q. (2014). Music emotion recognition by multi-label multi-layer multi-instance multi-view learning. In Proceedings of the 22nd ACM international conference on multimedia. (pp. 117\u2013126).","DOI":"10.1145\/2647868.2654904"},{"issue":"11","key":"10079_CR41","doi-asserted-by":"publisher","first-page":"1675","DOI":"10.1109\/TASLP.2019.2925934","volume":"27","author":"Y Xie","year":"2019","unstructured":"Xie, Y., Liang, R., Liang, Z., Huang, C., Zou, C., & Schuller, B. (2019). Speech emotion classification using attention-based lstm. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 27(11), 1675\u20131685.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10079_CR42","doi-asserted-by":"crossref","unstructured":"Yenter, A., & Verma, A. (2017). Deep cnn-lstm with combined kernels from multiple branches for imdb review sentiment analysis. In 2017 IEEE 8th annual ubiquitous computing, electronics and mobile communication conference (UEMCON) (pp. 540\u2013546).","DOI":"10.1109\/UEMCON.2017.8249013"},{"key":"10079_CR43","doi-asserted-by":"crossref","unstructured":"Yoon, S., Byun, S., & Jung, K. (2018). Multimodal speech emotion recognition using audio and text. In 2018 IEEE spoken language technology workshop (SLT) (pp. 112\u2013118). IEEE.","DOI":"10.1109\/SLT.2018.8639583"},{"issue":"2","key":"10079_CR44","doi-asserted-by":"publisher","DOI":"10.1016\/j.heliyon.2020.e03372","volume":"6","author":"B Zada","year":"2020","unstructured":"Zada, B., & Ullah, R. (2020). Pashto isolated digits recognition using deep convolutional neural network. 
Heliyon, 6(2), e03372.","journal-title":"Heliyon"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10079-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-023-10079-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10079-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,11]],"date-time":"2024-01-11T10:17:24Z","timestamp":1704968244000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-023-10079-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12]]},"references-count":44,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["10079"],"URL":"https:\/\/doi.org\/10.1007\/s10772-023-10079-0","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2023,12]]},"assertion":[{"value":"24 August 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 November 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 December 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no relevant financial or non-financial interests to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
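
The record above is a Crossref REST API "work" response for DOI 10.1007/s10772-023-10079-0: an envelope with "status", "message-type", and a "message" object that holds the bibliographic metadata and the deposited reference list. As a minimal sketch of how such a record can be fetched and its main fields read (assuming Python 3 with the third-party requests package; the endpoint is the public Crossref works route, and the field names below are taken directly from the record shown above):

import requests  # assumption: third-party HTTP client, not part of the record itself

DOI = "10.1007/s10772-023-10079-0"

# Fetch the work record from the public Crossref REST API; the payload sits under "message".
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]

# "title" and "container-title" are lists; "author" entries carry "given"/"family" names.
title = work["title"][0]
journal = work["container-title"][0]
authors = ", ".join(f'{a.get("given", "")} {a["family"]}'.strip() for a in work.get("author", []))
year = work["issued"]["date-parts"][0][0]

print(f"{authors} ({year}). {title}. {journal}, {work.get('volume')}({work.get('issue')}), {work.get('page')}.")
print(f"References deposited: {work.get('reference-count')}; cited by: {work.get('is-referenced-by-count')} works.")

Run against the live API, this should reproduce the bibliographic fields shown in the record (volume 26, issue 4, pages 1165-1181), although counts such as is-referenced-by-count and the "indexed" timestamp change over time as Crossref updates the record.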