{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:05:56Z","timestamp":1775199956030,"version":"3.50.1"},"reference-count":30,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434693","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["Beyond Modality Limitations: A Unified MLLM Approach to Automated Speaking Assessment with Effective Curriculum Learning"],"prefix":"10.1109","author":[{"given":"Yu-Hsuan","family":"Fang","sequence":"first","affiliation":[{"name":"National Taiwan Normal University"}]},{"given":"Tien-Hong","family":"Lo","sequence":"additional","affiliation":[{"name":"National Taiwan Normal University"}]},{"given":"Yao-Ting","family":"Sung","sequence":"additional","affiliation":[{"name":"National Taiwan Normal University"}]},{"given":"Berlin","family":"Chen","sequence":"additional","affiliation":[{"name":"National Taiwan Normal University"}]}],"member":"263","reference":[{"key":"ref1","article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Tang"},{"key":"ref2","article-title":"Qwen2-audio technical report","volume-title":"arXiv preprint 
arXiv:2407.10759","author":"Chu","year":"2024"},{"key":"ref3","article-title":"Phi-4-mini technical report: Compact yet powerful multimodal language models via mixture-of-loras","volume-title":"arXiv preprint arXiv:2503.01743","year":"2025"},{"key":"ref4","article-title":"Omni-r1: Do you really need audio to fine-tune your audio LLM?","author":"Rouditchenko","year":"2025","journal-title":"arXiv preprint arXiv:2505.09439"},{"key":"ref5","article-title":"Gpt-4 technical report","author":"Achiam","year":"2024"},{"key":"ref6","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin"},{"key":"ref7","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref8","first-page":"705","article-title":"Automated scoring of spontaneous speech from young learners of english using transformers","volume-title":"2021 IEEE Spoken Language Technology Workshop, SLT 2021","author":"Wang"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023019"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3706468.3706480"},{"key":"ref11","first-page":"1352","article-title":"An effective automated speaking assessment approach to mitigating data scarcity and imbalanced distribution","volume-title":"Findings of the Association for Computational Linguistics: NAACL 2024.","author":"Lo"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1146\/annurev-linguistics-030521-052114"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389696"},{"key":"ref14","first-page":"126","article-title":"Assessment of 
L2 oral proficiency using self-supervised speech representation learning","volume-title":"9th Workshop on Speech and Language Technology in Education (SLaTE).","author":"Bann\u00f2"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10245"},{"key":"ref16","doi-asserted-by":"crossref","DOI":"10.1145\/1553374.1553380","article-title":"Curriculum learning","volume-title":"International Conference on Machine Learning","author":"Bengio"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1980.1163420"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W15-0602"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1002\/j.2333-8504.2008.tb02148.x"},{"key":"ref20","first-page":"103","article-title":"Exploring content features for automated speech scoring","volume-title":"Proceedings of the 2012 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies.","author":"Xie"},{"key":"ref21","first-page":"174","article-title":"A preliminary study on automated speaking assessment of English as a second language (ESL) students","volume-title":"Proceedings of the 34th Conference on Computational Linguistics and Speech Processing (ROCLING 2022)","author":"Wu"},{"key":"ref22","first-page":"1024","article-title":"wavllm: Hierarchical curriculum learning for multimodal speaking assessment","volume":"32","author":"Chen","year":"2024","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"ref23","first-page":"506","article-title":"Oversampling, augmentation and curriculum learning for speaking assessment with limited training data","volume-title":"Proc. 
INTERSPEECH 2024","author":"Zhang"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01918"},{"key":"ref25","first-page":"12401","article-title":"Self-powered LLM modality expansion for large speech-text models","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP).","author":"Yu"},{"key":"ref26","first-page":"2048","article-title":"A speaking practice tool on teemi for automated english-speaking assessment of chinese learners","volume-title":"Proceedings of the Annual Conference of the International Speech Communication Association (INTERSPEECH)","author":"Chen"},{"key":"ref27","article-title":"Speak & improve corpus 2025: an L2 english speech corpus for language assessment and feedback","volume":"abs\/2412.11986","author":"Knill","year":"2024","journal-title":"ArXiv"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832269"},{"key":"ref29","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"International Conference on Learning Representations","author":"Hu"},{"key":"ref30","article-title":"Flashattention-2: Faster attention with better parallelism and work partitioning","volume-title":"The Twelfth International Conference on Learning Representations","author":"Dao"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop 
(ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434693.pdf?arnumber=11434693","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:58:30Z","timestamp":1775192310000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434693\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":30,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434693","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}