{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T16:49:28Z","timestamp":1781282968315,"version":"3.54.1"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T00:00:00Z","timestamp":1728604800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T00:00:00Z","timestamp":1728604800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J AUDIO SPEECH MUSIC PROC."],"DOI":"10.1186\/s13636-024-00368-0","type":"journal-article","created":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T02:01:39Z","timestamp":1728612099000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["UTran-DSR: a novel transformer-based model using feature enhancement for dysarthric speech recognition"],"prefix":"10.1186","volume":"2024","author":[{"given":"Usama","family":"Irshad","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rabbia","family":"Mahum","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ismaila","family":"Ganiyu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Faisal Shafique","family":"Butt","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lotfi","family":"Hidri","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tamer G.","family":"Ali","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ahmed M.","family":"El-Sherbeeny","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,11]]},"reference":[{"key":"368_CR1","doi-asserted-by":"crossref","unstructured":"R. Mahum, A. Irtaza, A. Javed, EDL-Det: A Robust TTS Synthesis Detector Using VGG19-Based YAMNet and Ensemble Learning Block. IEEE Access 11, 134701\u2013134716 (2023)","DOI":"10.1109\/ACCESS.2023.3332561"},{"issue":"3","key":"368_CR2","doi-asserted-by":"publisher","first-page":"503","DOI":"10.1044\/jshd.5503.503","volume":"55","author":"S Sapir","year":"1990","unstructured":"S. Sapir, A.E. Aronson, The relationship between psychopathology and speech and language disorders in neurologic patients. J. Speech Hear. Disord. 55(3), 503\u2013509 (1990)","journal-title":"J. Speech Hear. Disord."},{"issue":"9","key":"368_CR3","doi-asserted-by":"publisher","first-page":"743","DOI":"10.1016\/j.specom.2007.05.001","volume":"49","author":"AB Kain","year":"2007","unstructured":"A.B. Kain et al., Improving the intelligibility of dysarthric speech. Speech Commun. 49(9), 743\u2013759 (2007)","journal-title":"Speech Commun."},{"key":"368_CR4","doi-asserted-by":"publisher","DOI":"10.1044\/1092-4388(2011\/11-0223)","volume-title":"Vocal tract representation in the recognition of cerebral palsied speech","author":"F Rudzicz","year":"2012","unstructured":"F. Rudzicz, G. Hirst, P. van Lieshout, Vocal tract representation in the recognition of cerebral palsied speech (2012)"},{"key":"368_CR5","volume-title":"In Interspeech","author":"MJ Kim","year":"2013","unstructured":"M.J. Kim, J. Yoo, H. Dim, Dysarthric speech recognition using dysarthria severity-dependent and speaker-adaptive models, in In Interspeech. (2013)"},{"issue":"5","key":"368_CR6","doi-asserted-by":"publisher","first-page":"716","DOI":"10.1080\/13682820802342062","volume":"44","author":"G Van Nuffelen","year":"2009","unstructured":"G. Van Nuffelen et al., Speech technology-based assessment of phoneme intelligibility in dysarthria. Int. J. Lang. Commun. Disord. 44(5), 716\u2013730 (2009)","journal-title":"Int. J. Lang. Commun. Disord."},{"issue":"3","key":"368_CR7","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1111\/ijs.12067","volume":"10","author":"M Ali","year":"2015","unstructured":"M. Ali, P. Lyden, M. Brady, Aphasia and dysarthria in acute stroke: recovery and functional outcome. Int. J. Stroke 10(3), 400\u2013406 (2015)","journal-title":"Int. J. Stroke"},{"issue":"15","key":"368_CR8","doi-asserted-by":"publisher","first-page":"9089","DOI":"10.1007\/s00521-020-05672-2","volume":"33","author":"BF Zaidi","year":"2021","unstructured":"B.F. Zaidi et al., Deep neural network architectures for dysarthric speech analysis and recognition. Neural Comput. Appl. 33(15), 9089\u20139108 (2021)","journal-title":"Neural Comput. Appl."},{"key":"368_CR9","doi-asserted-by":"publisher","unstructured":"R. Mahum, A. Irtaza, A. Javed et al., DeepDet: YAMNet with BottleNeck Attention Module (BAM) for TTS synthesis detection. J AUDIO SPEECH MUSIC PROC 2024, 18 (2024). https:\/\/doi.org\/10.1186\/s13636-024-00335-9","DOI":"10.1186\/s13636-024-00335-9"},{"key":"368_CR10","doi-asserted-by":"crossref","unstructured":"F. Xiong, J. Barker, H. Christensen, Phonetic analysis of dysarthric speech tempo and applications to robust personalised dysarthric speech recognition, in ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). (IEEE, 2019, May), pp. 5836\u20135840","DOI":"10.1109\/ICASSP.2019.8683091"},{"key":"368_CR11","doi-asserted-by":"crossref","unstructured":"L. Dong, S. Xu, B. Xu, Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition, in 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP). (IEEE, 2018, April), pp. 5884\u20135888","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"368_CR12","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1016\/j.csl.2019.05.002","volume":"58","author":"E Y\u0131lmaz","year":"2019","unstructured":"E. Y\u0131lmaz et al., Articulatory and bottleneck features for speaker-independent ASR of dysarthric speech. Comput. Speech Lang. 58, 319\u2013334 (2019)","journal-title":"Comput. Speech Lang."},{"key":"368_CR13","doi-asserted-by":"publisher","first-page":"67745","DOI":"10.1109\/ACCESS.2020.2986171","volume":"8","author":"N Narendra","year":"2020","unstructured":"N. Narendra, P. Alku, Glottal source information for pathological voice detection. IEEE Access 8, 67745\u201367755 (2020)","journal-title":"IEEE Access"},{"key":"368_CR14","doi-asserted-by":"crossref","unstructured":"D. Bahdanau, J. Chorowski, D. Serdyuk, P. Brakel, Y. Bengio, End-to-end attention-based large vocabulary speech recognition, in 2016 IEEE international conference on acoustics, speech and signal processing (ICASSP). (IEEE, 2016), pp. 4945\u20134949","DOI":"10.1109\/ICASSP.2016.7472618"},{"issue":"6","key":"368_CR15","first-page":"384","volume":"13","author":"H Albaqshi","year":"2020","unstructured":"H. Albaqshi, A. Sagheer, Dysarthric speech recognition using convolutional recurrent neural networks. Int. J. Intell. Eng. Syst. 13(6), 384\u2013392 (2020)","journal-title":"Int. J. Intell. Eng. Syst."},{"key":"368_CR16","first-page":"1","volume":"46","author":"F Rudzicz","year":"2010","unstructured":"F. Rudzicz, A. Namasivayam, T. Wolff, The TORGO database of acoustic and articulatory speech from speakers with dysarthria. Lang. Resour. Eval. 46, 1\u201319 (2010)","journal-title":"Lang. Resour. Eval."},{"key":"368_CR17","doi-asserted-by":"crossref","unstructured":"Y. Takashima, T. Takiguchi, Y. Ariki, End-to-end dysarthric speech recognition using multiple databases, in ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). (IEEE, 2019), pp. 6395\u20136399","DOI":"10.1109\/ICASSP.2019.8683803"},{"key":"368_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13636-019-0169-5","volume":"2020","author":"M Sidi Yakoub","year":"2020","unstructured":"M. Sidi Yakoub et al., Improving dysarthric speech recognition using empirical mode decomposition and convolutional neural network. EURASIP J. Audio Speech Music Proc. 2020, 1\u20137 (2020)","journal-title":"EURASIP J. Audio Speech Music Proc."},{"key":"368_CR19","doi-asserted-by":"crossref","unstructured":"R. Takashima, T. Takiguchi, Y. Ariki, Two-step acoustic model adaptation for dysarthric speech recognition, in ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). (IEEE, 2020), pp. 6104\u20136108","DOI":"10.1109\/ICASSP40776.2020.9053725"},{"key":"368_CR20","volume-title":"Proceedings of SLPAT 2015: 6th workshop on speech and language processing for assistive technologies","author":"S Hahm","year":"2015","unstructured":"S. Hahm, D. Heitzman, J. Wang, Recognizing dysarthric speech due to amyotrophic lateral sclerosis with across-speaker articulatory normalization, in Proceedings of SLPAT 2015: 6th workshop on speech and language processing for assistive technologies. (2015)"},{"issue":"9","key":"368_CR21","doi-asserted-by":"publisher","first-page":"1581","DOI":"10.1109\/TNSRE.2017.2681691","volume":"25","author":"M Kim","year":"2017","unstructured":"M. Kim et al., Regularized speaker adaptation of KL-HMM for dysarthric speech recognition. IEEE Trans. Neural Syst. Rehabil. Eng. 25(9), 1581\u20131591 (2017)","journal-title":"IEEE Trans. Neural Syst. Rehabil. Eng."},{"issue":"3","key":"368_CR22","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1109\/6046.865479","volume":"2","author":"S Dupont","year":"2000","unstructured":"S. Dupont, J. Luettin, Audio-visual speech modeling for continuous speech recognition. IEEE Trans. Multimedia 2(3), 141\u2013151 (2000)","journal-title":"IEEE Trans. Multimedia"},{"key":"368_CR23","doi-asserted-by":"crossref","unstructured":"J. Yu, B. Wu, R. Gu, S.-X. Zhang, L. Chen, Y. Xu, M. Yu, D. Su, D. Yu, X. Liu, H. Meng, Audio-visual multi-channel recognition of overlapped speech (2020). arXiv preprint arXiv:2005.08571","DOI":"10.21437\/Interspeech.2020-2346"},{"key":"368_CR24","volume-title":"INTERSPEECH","author":"S Liu","year":"2019","unstructured":"S. Liu et al., Exploiting visual features using bayesian gated neural networks for disordered speech recognition, in INTERSPEECH. (2019)"},{"key":"368_CR25","doi-asserted-by":"crossref","unstructured":"C. Miyamoto, Y. Komai, T. Takiguchi, Y. Ariki, I. Li, Multimodal speech recognition of a person with articulation disorders using AAM and MAF, in 2010 IEEE International Workshop on Multimedia Signal Processing. (IEEE, 2010), pp. 517\u2013520","DOI":"10.1109\/MMSP.2010.5662075"},{"key":"368_CR26","volume-title":"Interspeech","author":"S Liu","year":"2020","unstructured":"S. Liu et al., Exploiting cross-domain visual feature generation for disordered speech recognition, in Interspeech. (2020)"},{"key":"368_CR27","doi-asserted-by":"publisher","first-page":"2267","DOI":"10.1109\/TASLP.2021.3091805","volume":"29","author":"S Liu","year":"2021","unstructured":"S. Liu et al., Recent progress in the cuhk dysarthric speech recognition system. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 2267\u20132281 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"368_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2024.103047","volume":"158","author":"F Javanmardi","year":"2024","unstructured":"F. Javanmardi, S.R. Kadiri, P. Alku, Pre-trained models for detection and severity level classification of dysarthria from speech. Speech Commun. 158, 103047 (2024)","journal-title":"Speech Commun."},{"issue":"1","key":"368_CR29","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1186\/s13636-024-00357-3","volume":"2024","author":"S Sajiha","year":"2024","unstructured":"S. Sajiha et al., Automatic dysarthria detection and severity level assessment using CWT-layered CNN model. EURASIP J. Audio Speech Music Process. 2024(1), 33 (2024)","journal-title":"EURASIP J. Audio Speech Music Process."},{"issue":"5","key":"368_CR30","doi-asserted-by":"publisher","first-page":"3261","DOI":"10.1007\/s00034-024-02611-7","volume":"43","author":"K Radha","year":"2024","unstructured":"K. Radha, M. Bansal, V.R. Dhulipalla, Variable STFT layered CNN model for automated dysarthria detection and severity assessment using raw speech. Circuits Syst. Signal Process. 43(5), 3261\u20133278 (2024)","journal-title":"Circuits Syst. Signal Process."},{"key":"368_CR31","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1016\/B978-0-12-374136-3.00002-X","volume":"2009","author":"\u00c5 Rinnan","year":"2009","unstructured":"\u00c5. Rinnan et al., Data pre-processing. Infrared Spectrosc. Food Qual. Anal. Ctl. 2009, 29\u201350 (2009)","journal-title":"Infrared Spectrosc. Food Qual. Anal. Ctl."},{"key":"368_CR32","unstructured":"L. Perez, The effectiveness of data augmentation in image classification using deep learning (2017). arXiv preprint arXiv:1712.04621"},{"key":"368_CR33","doi-asserted-by":"publisher","unstructured":"B. Chen, Y. Liu, Z. Zhang, G. Lu, A.W.K. Kong, TransAttUnet: Multi-Level Attention-Guided U-Net With Transformer for Medical Image Segmentation, in IEEE Transactions on Emerging Topics in Computational Intelligence, vol. 8, no. 1, (2024), pp. 55\u201368. https:\/\/doi.org\/10.1109\/TETCI.2023.3309626","DOI":"10.1109\/TETCI.2023.3309626"},{"issue":"1","key":"368_CR34","doi-asserted-by":"publisher","first-page":"014006","DOI":"10.1117\/1.JMI.6.1.014006","volume":"6","author":"MZ Alom","year":"2019","unstructured":"M.Z. Alom et al., Recurrent residual U-Net for medical image segmentation. J. Med. Imaging 6(1), 014006\u2013014006 (2019)","journal-title":"J. Med. Imaging"},{"key":"368_CR35","volume-title":"In Proceedings of the IEEE conference on computer vision and pattern recognition","author":"K He","year":"2016","unstructured":"K. He et al., Deep residual learning for image recognition, in In Proceedings of the IEEE conference on computer vision and pattern recognition. (2016)"},{"key":"368_CR36","volume-title":"In Proceedings of the IEEE conference on computer vision and pattern recognition","author":"G Huang","year":"2017","unstructured":"G. Huang et al., Densely connected convolutional networks, in In Proceedings of the IEEE conference on computer vision and pattern recognition. (2017)"},{"key":"368_CR37","unstructured":"J. Devlin, M.-W. Chang, K. Lee, K. Toutanova, Bert: Pre-training of deep bidirectional transformers for language understanding, in Proceedings of naacL-HLT, vol. 1, (2019), p. 2"},{"key":"368_CR38","volume-title":"in Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"L Xu","year":"2022","unstructured":"L. Xu et al., Multi-class token transformer for weakly supervised semantic segmentation, in in Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. (2022)"},{"key":"368_CR39","unstructured":"C.-Y. Lee, S. Xie, P. Gallagher, Z. Zhang, Z. Tu, Deeply-supervised nets, in Artificial intelligence and statistics. (Pmlr, 2015), pp. 562\u2013570"},{"key":"368_CR40","volume-title":"In Interspeech","author":"H Kim","year":"2008","unstructured":"H. Kim et al., Dysarthric speech database for universal access research, in In Interspeech. (2008)"},{"key":"368_CR41","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2020.101114","volume":"64","author":"X Wang","year":"2020","unstructured":"X. Wang et al., ASVspoof 2019: A large-scale public database of synthesized, converted and replayed speech. Comput. Speech Lang. 64, 101114 (2020)","journal-title":"Comput. Speech Lang."},{"key":"368_CR42","doi-asserted-by":"publisher","first-page":"523","DOI":"10.1007\/s10579-011-9145-0","volume":"46","author":"F Rudzicz","year":"2012","unstructured":"F. Rudzicz, A.K. Namasivayam, T. Wolff, The TORGO database of acoustic and articulatory speech from speakers with dysarthria. Lang. Resour. Eval. 46, 523\u2013541 (2012)","journal-title":"Lang. Resour. Eval."},{"key":"368_CR43","unstructured":"I. Loshchilov, Decoupled weight decay regularization (2017). arXiv preprint arXiv:1711.05101"},{"issue":"3","key":"368_CR44","doi-asserted-by":"publisher","first-page":"23","DOI":"10.13064\/KSSS.2019.11.3.023","volume":"11","author":"A Hernandez","year":"2019","unstructured":"A. Hernandez, H.-Y. Lee, M. Chung, Acoustic analysis of fricatives in dysarthric speakers with cerebral palsy. Phon. Speech Sci. 11(3), 23\u201329 (2019)","journal-title":"Phon. Speech Sci."},{"issue":"1","key":"368_CR45","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1007\/s11277-021-08899-x","volume":"122","author":"R Rajeswari","year":"2022","unstructured":"R. Rajeswari, T. Devi, S. Shalini, Dysarthric speech recognition using variational mode decomposition and convolutional neural networks. Wireless Pers. Commun. 122(1), 293\u2013307 (2022)","journal-title":"Wireless Pers. Commun."},{"key":"368_CR46","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1016\/j.specom.2019.04.003","volume":"110","author":"N Narendra","year":"2019","unstructured":"N. Narendra, P. Alku, Dysarthric speech classification from coded telephone speech using glottal features. Speech Commun. 110, 47\u201355 (2019)","journal-title":"Speech Commun."},{"key":"368_CR47","doi-asserted-by":"publisher","unstructured":"S.R. Shahamiri, V. Lal, D. Shah, Dysarthric Speech Transformer: A Sequence-to-Sequence Dysarthric Speech Recognition System, in IEEE Transactions on Neural Systems and Rehabilitation Engineering, vol. 31, (2023), pp. 3407\u20133416. https:\/\/doi.org\/10.1109\/TNSRE.2023.3307020","DOI":"10.1109\/TNSRE.2023.3307020"},{"key":"368_CR48","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.119797","volume":"222","author":"A Almadhor","year":"2023","unstructured":"A. Almadhor et al., E2E-DASR: End-to-end deep learning-based dysarthric automatic speech recognition. Expert Syst. Appl. 222, 119797 (2023)","journal-title":"Expert Syst. Appl."},{"key":"368_CR49","doi-asserted-by":"crossref","unstructured":"S. Mehra, V. Ranga, R. Agarwal, A deep learning approach to dysarthric utterance classification with BiLSTM-GRU, speech cue filtering, and log mel spectrograms. The Journal of Supercomputing 80, 14520\u201314547 (2024)","DOI":"10.1007\/s11227-024-06015-x"}],"container-title":["EURASIP Journal on Audio, Speech, and Music Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-024-00368-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s13636-024-00368-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-024-00368-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T02:02:32Z","timestamp":1728612152000},"score":1,"resource":{"primary":{"URL":"https:\/\/asmp-eurasipjournals.springeropen.com\/articles\/10.1186\/s13636-024-00368-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,11]]},"references-count":49,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2024,12]]}},"alternative-id":["368"],"URL":"https:\/\/doi.org\/10.1186\/s13636-024-00368-0","relation":{},"ISSN":["1687-4722"],"issn-type":[{"value":"1687-4722","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,11]]},"assertion":[{"value":"12 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 August 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 October 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.\u00a0All authors gave their consent to participate.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"All authors agreed to submit the manuscript for publication in the journal.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"The authors declare that they have no competing interests.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"54"}}