{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:16:25Z","timestamp":1757618185132,"version":"3.44.0"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T00:00:00Z","timestamp":1748563200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T00:00:00Z","timestamp":1748563200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s10772-025-10194-0","type":"journal-article","created":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T08:44:20Z","timestamp":1748594660000},"page":"443-459","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["The importance of deep learning models in speech signal processing: fundamentals, strategies, and future research directions"],"prefix":"10.1007","volume":"28","author":[{"given":"Ling","family":"Pan","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,30]]},"reference":[{"issue":"21","key":"10194_CR43","doi-asserted-by":"publisher","first-page":"8122","DOI":"10.3390\/s22218122","volume":"22","author":"AB Abdusalomov","year":"2022","unstructured":"Abdusalomov, A. B., Safarov, F., Rakhimov, M., Turaev, B., & Whangbo, T. K. (2022). Improved feature parameter extraction from speech signals using machine learning algorithm. Sensors (Basel, Switzerland), 22(21), 8122.","journal-title":"Sensors (Basel, Switzerland)"},{"issue":"11","key":"10194_CR9","doi-asserted-by":"publisher","first-page":"13521","DOI":"10.1007\/s10462-023-10466-8","volume":"56","author":"SF Ahmed","year":"2023","unstructured":"Ahmed, S. F., et al. (2023). Deep learning modelling techniques: Current progress, applications, advantages, and challenges. Artificial Intelligence Review, 56(11), 13521\u201313617.","journal-title":"Artificial Intelligence Review"},{"issue":"3","key":"10194_CR4","doi-asserted-by":"publisher","first-page":"789","DOI":"10.1007\/s10772-023-10048-7","volume":"26","author":"KA Al-Karawi","year":"2023","unstructured":"Al-Karawi, K. A., & Mohammed, D. Y. (2023). Using combined features to improve speaker verification in the face of limited reverberant data. International Journal of Speech Technology, 26(3), 789\u2013799.","journal-title":"International Journal of Speech Technology"},{"issue":"5","key":"10194_CR10","doi-asserted-by":"publisher","first-page":"1293","DOI":"10.1134\/S0022093022050027","volume":"58","author":"IG Andreeva","year":"2022","unstructured":"Andreeva, I. G., & Ogorodnikova, E. A. (2022). Auditory adaptation to speech signal characteristics. Journal of Evolutionary Biochemistry and Physiology, 58(5), 1293\u20131309.","journal-title":"Journal of Evolutionary Biochemistry and Physiology"},{"key":"10194_CR19","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1016\/j.procs.2020.08.027","volume":"176","author":"H Aouani","year":"2020","unstructured":"Aouani, H., & Ben Ayed, Y. (2020). Speech emotion recognition with deep learning. Procedia Computer Science, 176, 251\u2013260.","journal-title":"Procedia Comput Sci"},{"issue":"2","key":"10194_CR5","doi-asserted-by":"publisher","first-page":"23","DOI":"10.3390\/mining5020023","volume":"5","author":"A Azadi","year":"2025","unstructured":"Azadi, A., & Momayez, M. (2025). Simulating a weak rock mass by a constitutive model. Mining, 5(2), 23. https:\/\/doi.org\/10.3390\/mining5020023","journal-title":"Mining"},{"key":"10194_CR29","first-page":"27826","volume":"34","author":"A Baevski","year":"2021","unstructured":"Baevski, A., Hsu, W.-N., Conneau, A., & Auli, M. (2021). Unsupervised speech recognition. Advances in Neural Information Processing Systems, 34, 27826\u201327839.","journal-title":"Advances in Neural information Processing Systems"},{"key":"10194_CR25","doi-asserted-by":"publisher","unstructured":"Bagherabad, M. B., Rivandi, E., & Mehr, M. J. (2025). Machine learning for analyzing effects of various factors on business economic. TechRxiv. https:\/\/doi.org\/10.36227\/techrxiv.174429010.09842200\/v1","DOI":"10.36227\/techrxiv.174429010.09842200\/v1"},{"key":"10194_CR30","doi-asserted-by":"crossref","unstructured":"Ben Abdallah, A. A., Kabboudi, A., Kanoun, A., & Zaiem, S. (2024). Leveraging data collection and unsupervised learning for code-switched Tunisian Arabic automatic speech recognition. In IEEE international conference on acoustics, speech and signal processing (ICASSP 2024) (pp. 12607\u201312611) IEEE.","DOI":"10.1109\/ICASSP48485.2024.10445734"},{"key":"10194_CR16","doi-asserted-by":"crossref","unstructured":"Bennour, A., Boudraa, M., Siddiqi, I., Al-Sarem, M., Al-Shabi, M., & Ghabban, F. (2024). A deep learning framework for historical manuscripts writer identification using data-driven features. Multimedia Tools and Applications, 1\u201327.","DOI":"10.1007\/s11042-024-18187-y"},{"issue":"9","key":"10194_CR22","doi-asserted-by":"publisher","first-page":"4419","DOI":"10.3390\/app12094419","volume":"12","author":"V Bhardwaj","year":"2022","unstructured":"Bhardwaj, V., et al. (2022). Automatic speech recognition (ASR) systems for children: A systematic literature review. Applied Sciences, 12(9), 4419.","journal-title":"Applied Sciences"},{"key":"10194_CR37","doi-asserted-by":"publisher","first-page":"46082","DOI":"10.1109\/ACCESS.2022.3153469","volume":"10","author":"YW Chen","year":"2022","unstructured":"Chen, Y.-W., et al. (2022). CITISEN: A deep learning-based speech signal-processing mobile application. IEEE Access: Practical Innovations, Open Solutions, 10, 46082\u201346099.","journal-title":"Ieee Access: Practical Innovations, Open Solutions"},{"key":"10194_CR34","doi-asserted-by":"crossref","unstructured":"Chen, C., Hu, Y., Zhang, Q., Zou, H., Zhu, B., & Chng, E. S. (2023). Leveraging modality-specific representations for audio-visual speech recognition via reinforcement learning. In Proceedings of the AAAI conference on artificial intelligence (pp. 12607\u201312615).","DOI":"10.1609\/aaai.v37i11.26484"},{"key":"10194_CR27","doi-asserted-by":"publisher","first-page":"1412","DOI":"10.1109\/LSP.2022.3181971","volume":"29","author":"J Choi","year":"2022","unstructured":"Choi, J., & Chang, J. H. (2022). Supervised learning approach for explicit spatial filtering of speech. IEEE Signal Processing Letters, 29, 1412\u20131416.","journal-title":"Ieee Signal Processing Letters"},{"key":"10194_CR15","doi-asserted-by":"publisher","first-page":"1071","DOI":"10.1007\/s11831-019-09344-w","volume":"27","author":"S Dargan","year":"2020","unstructured":"Dargan, S., Kumar, M., Ayyagari, M. R., & Kumar, G. (2020). A survey of deep learning and its applications: A new paradigm to machine learning. Archives of Computational Methods in Engineering, 27, 1071\u20131092.","journal-title":"Archives of Computational Methods in Engineering"},{"key":"10194_CR3","doi-asserted-by":"publisher","first-page":"119871","DOI":"10.1016\/j.eswa.2023.119871","volume":"224","author":"S Hamsa","year":"2023","unstructured":"Hamsa, S., Shahin, I., Iraqi, Y., Damiani, E., Nassif, A. B., & Werghi, N. (2023). Speaker identification from emotional and noisy speech using learned voice segregation and speech VGG. Expert Systems with Applications, 224, 119871.","journal-title":"Expert Systems with Applications"},{"key":"10194_CR40","doi-asserted-by":"publisher","first-page":"109492","DOI":"10.1016\/j.apacoust.2023.109492","volume":"211","author":"C Hema","year":"2023","unstructured":"Hema, C., & Marquez, F. P. G. (2023). Emotional speech recognition using CNN and deep learning techniques. Applied Acoustics, 211, 109492.","journal-title":"Applied Acoustics"},{"key":"10194_CR36","doi-asserted-by":"crossref","unstructured":"Kadhim, I. B., Khaleel, M. F., Mahmood, Z. S., & Coran, A. N. N. (2022). Reinforcement learning for speech recognition using recurrent neural networks. In 2022 2nd Asian Conference on Innovation in Technology (ASIANCON) (pp. 1\u20135). IEEE.","DOI":"10.1109\/ASIANCON55314.2022.9908930"},{"key":"10194_CR42","doi-asserted-by":"publisher","first-page":"103109","DOI":"10.1016\/j.dsp.2021.103109","volume":"116","author":"H Kim","year":"2021","unstructured":"Kim, H., & Shin, J. W. (2021). Target exaggeration for deep learning-based speech enhancement. Digital Signal Processing, 116, 103109.","journal-title":"Digit Signal Process"},{"key":"10194_CR38","doi-asserted-by":"publisher","first-page":"102612","DOI":"10.1016\/j.bspc.2021.102612","volume":"68","author":"M Krecichwost","year":"2021","unstructured":"Krecichwost, M., Mocko, N., & Badura, P. (2021). Automated detection of sigmatism using deep learning applied to multichannel speech signal. Biomedical Signal Processing and Control, 68, 102612.","journal-title":"Biomedical Signal Processing and Control"},{"key":"10194_CR2","doi-asserted-by":"publisher","first-page":"108998","DOI":"10.1016\/j.engappai.2024.108998","volume":"136","author":"L Lazzaroni","year":"2024","unstructured":"Lazzaroni, L., Bellotti, F., & Berta, R. (2024). An embedded end-to-end voice assistant. Engineering Applications of Artificial Intelligence, 136, 108998.","journal-title":"Engineering Applications of Artificial Intelligence"},{"key":"10194_CR31","doi-asserted-by":"crossref","unstructured":"Liu, A. H., Hsu, W.-N., Auli, M., & Baevski, A. (2023). Towards end-to-end unsupervised speech recognition. In 2022 IEEE Spoken Language technology workshop (SLT) (pp. 221\u2013228). IEEE.","DOI":"10.1109\/SLT54892.2023.10023187"},{"key":"10194_CR1","doi-asserted-by":"crossref","unstructured":"Mehra, S., & Susan, S. (2022). Early fusion of phone embeddings for recognition of low-resourced accented speech. In 4th international conference on artificial intelligence and speech technology (AIST 2022) (pp. 1\u20135) IEEE.","DOI":"10.1109\/AIST55798.2022.10064735"},{"issue":"35","key":"10194_CR6","doi-asserted-by":"publisher","first-page":"82533","DOI":"10.1007\/s11042-024-18804-w","volume":"83","author":"S Mehra","year":"2024","unstructured":"Mehra, S., Ranga, V., Agarwal, R., & Susan, S. (2024a). Speaker independent recognition of low-resourced multilingual Arabic spoken words through hybrid fusion. Multimedia Tools and Applications, 83(35), 82533\u201382561.","journal-title":"Multimed Tools Appl"},{"issue":"6","key":"10194_CR18","doi-asserted-by":"publisher","first-page":"e70012","DOI":"10.1111\/coin.70012","volume":"40","author":"S Mehra","year":"2024","unstructured":"Mehra, S., Ranga, V., & Agarwal, R. (2024b). Multimodal integration of Mel spectrograms and text transcripts for enhanced automatic speech recognition: Leveraging extractive transformer-based approaches and late fusion strategies. Computational Intelligence, 40(6), e70012.","journal-title":"Computational Intelligence"},{"issue":"3","key":"10194_CR23","doi-asserted-by":"publisher","first-page":"2020","DOI":"10.1007\/s00034-024-02915-8","volume":"44","author":"S Mehra","year":"2025","unstructured":"Mehra, S., Ranga, V., & Agarwal, R. (2025). Dhivehi speech recognition: A multimodal approach for Dhivehi language in resource-constrained settings. Circuits, Systems, and Signal Processing, 44(3), 2020\u20132040.","journal-title":"Circuits Syst Signal Process"},{"key":"10194_CR13","doi-asserted-by":"publisher","first-page":"1368","DOI":"10.1109\/TASLP.2021.3066303","volume":"29","author":"D Michelsanti","year":"2021","unstructured":"Michelsanti, D., et al. (2021). An overview of deep-learning-based audio-visual speech enhancement and separation. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 29, 1368\u20131396.","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"issue":"1","key":"10194_CR39","doi-asserted-by":"publisher","first-page":"183","DOI":"10.3390\/s20010183","volume":"20","author":"Mustaqeem","year":"2019","unstructured":"Mustaqeem, & Kwon, S. (2019). A CNN-assisted enhanced audio signal processing for speech emotion recognition. Sensors, 20(1), 183.","journal-title":"Sensors (Basel, Switzerland)"},{"key":"10194_CR33","unstructured":"Ni, J., et al. (2024). Towards unsupervised speech recognition without pronunciation models. arXiv preprint arXiv:2406.08380."},{"key":"10194_CR14","doi-asserted-by":"publisher","first-page":"103357","DOI":"10.1016\/j.autcon.2020.103357","volume":"119","author":"Y Pan","year":"2020","unstructured":"Pan, Y., Zhang, G., & Zhang, L. (2020). A spatial-channel hierarchical deep learning network for pixel-level automated crack detection. Automation in Construction, 119, 103357.","journal-title":"Automation in Construction"},{"key":"10194_CR7","doi-asserted-by":"publisher","first-page":"108085","DOI":"10.1016\/j.chb.2023.108085","volume":"152","author":"G Pei","year":"2024","unstructured":"Pei, G., Shang, Q., Hua, S., Li, T., & Jin, J. (2024). EEG-based affective computing in virtual reality with a balancing of the computational efficiency and recognition accuracy. Computers in Human Behavior, 152, 108085.","journal-title":"Comput Human Behav"},{"key":"10194_CR26","doi-asserted-by":"crossref","unstructured":"Purushotham, U., Chethan, K. S., Manasa, S., & Meghana, U. (2020). Speech enhancement using semi-supervised learning. In 2020 international conference on intelligent engineering and management (ICIEM) (pp. 381\u2013385) IEEE.","DOI":"10.1109\/ICIEM48762.2020.9160287"},{"key":"10194_CR20","doi-asserted-by":"publisher","first-page":"10239","DOI":"10.1109\/ACCESS.2021.3051432","volume":"9","author":"C Quan","year":"2021","unstructured":"Quan, C., Ren, K., & Luo, Z. (2021). A deep learning based method for Parkinson\u2019s disease detection using dynamic features of speech. IEEE Access: Practical Innovations, Open Solutions, 9, 10239\u201310252.","journal-title":"Ieee Access: Practical Innovations, Open Solutions"},{"key":"10194_CR35","unstructured":"Rajapakshe, T., Latif, S., Rana, R., Khalifa, S., & Schuller, B. W. (2020). Deep reinforcement learning with pre-training for time-efficient training of automatic speech recognition. arXiv preprint arXiv:11172."},{"key":"10194_CR24","doi-asserted-by":"publisher","unstructured":"Rivandi, E. (2024). FinTech and the level of its adoption in different countries around the world. Available at SSRN. https:\/\/doi.org\/10.2139\/ssrn.5049827","DOI":"10.2139\/ssrn.5049827"},{"key":"10194_CR8","doi-asserted-by":"crossref","unstructured":"S\u00f6nmez, Y. \u00dc., & Varol, A. (2024). In-depth investigation of speech emotion recognition studies from past to present: The importance of emotion recognition from speech signal for AI. Intelligent Systems with Applications, 200351.","DOI":"10.1016\/j.iswa.2024.200351"},{"key":"10194_CR28","doi-asserted-by":"crossref","unstructured":"Trinh, N. H., & O\u2019Brien, D. (2020). Semi-supervised learning with generative adversarial networks for pathological speech classification. In  31st Irish signals and systems conference (ISSC 2020) (pp. 1\u20135) IEEE.","DOI":"10.1109\/ISSC49989.2020.9180211"},{"key":"10194_CR11","doi-asserted-by":"publisher","first-page":"101308","DOI":"10.1016\/j.csl.2021.101308","volume":"72","author":"A Wali","year":"2022","unstructured":"Wali, A., et al. (2022). Generative adversarial networks for speech processing: A review. Computer Speech & Language, 72, 101308.","journal-title":"Comput Speech Lang"},{"issue":"8","key":"10194_CR41","doi-asserted-by":"publisher","first-page":"1289","DOI":"10.1109\/JSTSP.2017.2756439","volume":"11","author":"B Wu","year":"2017","unstructured":"Wu, B., et al. (2017). An end-to-end deep learning approach to simultaneous speech dereverberation and acoustic modeling for robust speech recognition. IEEE Journal of Selected Topics in Signal Processing, 11(8), 1289\u20131300.","journal-title":"IEEE J Sel Top Signal Process"},{"key":"10194_CR12","doi-asserted-by":"publisher","first-page":"163829","DOI":"10.1109\/ACCESS.2020.3020421","volume":"8","author":"C Yu","year":"2020","unstructured":"Yu, C., Kang, M., Chen, Y., Wu, J., & Zhao, X. (2020). Acoustic modeling based on deep learning for low-resource speech recognition: An overview. IEEE Access: Practical Innovations, Open Solutions, 8, 163829\u2013163843.","journal-title":"Ieee Access: Practical Innovations, Open Solutions"},{"key":"10194_CR21","doi-asserted-by":"publisher","first-page":"111716","DOI":"10.1016\/j.rse.2020.111716","volume":"241","author":"Q Yuan","year":"2020","unstructured":"Yuan, Q., et al. (2020). Deep learning in environmental remote sensing: Achievements and challenges. Remote Sensing of Environment, 241, 111716.","journal-title":"Remote Sensing of Environment"},{"key":"10194_CR32","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al. (2024). Unsupervised adaptive speaker recognition by coupling-regularized optimal transport. IEEE\/ACM Transactions on Audio, Speech, and Language Processing.","DOI":"10.1109\/TASLP.2024.3426934"},{"key":"10194_CR17","doi-asserted-by":"publisher","first-page":"106140","DOI":"10.1016\/j.bspc.2024.106140","volume":"93","author":"X Zhang","year":"2024","unstructured":"Zhang, X., & Xiao, H. (2024). Enhancing speech emotion recognition with the improved weighted average support vector method. Biomedical Signal Processing and Control, 93, 106140.","journal-title":"Biomedical Signal Processing and Control"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-025-10194-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-025-10194-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-025-10194-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T16:29:29Z","timestamp":1757176169000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-025-10194-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,30]]},"references-count":43,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["10194"],"URL":"https:\/\/doi.org\/10.1007\/s10772-025-10194-0","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2025,5,30]]},"assertion":[{"value":"13 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 April 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}