{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:05:11Z","timestamp":1775199911138,"version":"3.50.1"},"reference-count":75,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434680","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Joint ASR and Speech Attribute Prediction for Conversational Dysarthric Speech Analysis with Multimodal Language Models"],"prefix":"10.1109","author":[{"given":"Dominik","family":"Wagner","sequence":"first","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg"}]},{"given":"Ilja","family":"Baumann","sequence":"additional","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg"}]},{"given":"Natalie","family":"Engert","sequence":"additional","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg"}]},{"given":"Elmar","family":"N\u00f6th","sequence":"additional","affiliation":[{"name":"FAU Erlangen-N&#x00FC;rnberg"}]},{"given":"Korbinian","family":"Riedhammer","sequence":"additional","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg"}]},{"given":"Tobias","family":"Bocklet","sequence":"additional","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Pushing the limits of semi-supervised learning for automatic speech recognition","author":"Zhang","year":"2022"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3182537"},{"key":"ref3","article-title":"Robust Speech Recognition via Large-Scale Weak Supervision","author":"Radford","year":"2022"},{"key":"ref4","article-title":"wav2vec 2.0: a framework for self-supervised learning of speech representations","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Baevski"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-023-00318-2"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2391"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2013-324"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888041"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2003.1198933"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1044\/jshr.1202.246"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1016\/j.jns.2016.08.048"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1044\/2021_JSLHR-21-00123"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1044\/2021_JSLHR-20-00617"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1111\/1460-6984.12607"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.14744\/SEMB.2023.29560"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1044\/jshr.0704.325"},{"key":"ref18","article-title":"Dysarthria: Best practices for assessing intelligibility","author":"Dahl","year":"2024"},{"key":"ref19","article-title":"Gpt-4 technical report","volume-title":"OpenAI","year":"2024"},{"key":"ref20","article-title":"Gemini: A family of highly capable multimodal models","year":"2025"},{"key":"ref21","article-title":"A comprehensive review of multimodal large language models: Performance and challenges across different tasks","author":"Wang","year":"2024"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30570"},{"key":"ref23","article-title":"Moshi: a speech-text foundation model for real-time dialogue","author":"D\u00b4efossez","year":"2024"},{"key":"ref24","article-title":"Qwen2.5-omni technical report","author":"Xu","year":"2025"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0795"},{"key":"ref26","article-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models","author":"Chu","year":"2023","journal-title":"arXiv:2311.07919."},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446224"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1001\/jama.2023.9618"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.2196\/47551"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1001\/jama.2023.9458"},{"key":"ref31","article-title":"Phi-4-mini technical report: Compact yet powerful multimodal language models via mixture-of-loras","volume-title":"Microsoft"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1044\/2024_JSLHR-24-00122"},{"key":"ref33","article-title":"LoRA: Low-Rank Adaptation of Large Language Models","volume-title":"International Conference on Learning Representations","author":"Hu"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-48309-7_46"},{"key":"ref37","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2025-2155","article-title":"Personalized fine-tuning with controllable synthetic speech from llm-generated transcripts for dysarthric speech recognition","author":"Wagner","year":"2025"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10896"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10674"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3422839"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00696"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1969"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.3390\/app15042006"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TNSRE.2023.3307020"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889046"},{"key":"ref46","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2025-1553","article-title":"Exploring generative error correction for dysarthric speech recognition","author":"Quatra","year":"2025"},{"issue":"1","key":"ref47","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung","year":"2024","journal-title":"J. Mach. Learn. Res."},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1427"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1751"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10335"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1645"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094605"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.3390\/app10196999"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-45170-6_74"},{"key":"ref56","doi-asserted-by":"crossref","DOI":"10.1044\/2025_PERSP-25-00030","article-title":"Applications of artificial intelligence for cross-language intelligibility assessment of dysarthric speech","author":"Yeo","year":"2025"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.3389\/fdgth.2024.1440986"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSPW62465.2024.10626129"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889515"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.21437\/odyssey.2024-31"},{"key":"ref61","article-title":"Qwen2-audio technical report","author":"Chu","year":"2024"},{"key":"ref62","article-title":"Audio Flamingo: a novel audio language model with few-shot learning and dialogue abilities","volume-title":"Proceedings of the 41st International Conference on Machine Learning. JMLR.org","author":"Kong"},{"key":"ref63","article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Tang"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2013-313"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"ref67","article-title":"Gaussian error linear units (gelus)","author":"Hendrycks","year":"2023"},{"key":"ref68","article-title":"Decoupled weight decay regularization","volume-title":"International Conference on Learning Representations","author":"Loshchilov"},{"key":"ref69","article-title":"BERTScore: Evaluating text generation with BERT","volume-title":"International Conference on Learning Representations","author":"Zhang"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00576"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-102"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10908"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.63317\/3che8vy5uqt7"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1016\/j.compbiomed.2023.107559"},{"key":"ref75","article-title":"Cr-ctc: Consistency regularization on ctc for improved speech recognition","author":"Yao","year":"2025"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434680.pdf?arnumber=11434680","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:58:15Z","timestamp":1775192295000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434680\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":75,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434680","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}