{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:09:35Z","timestamp":1765008575340,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["62466058"],"award-info":[{"award-number":["62466058"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3771002","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:06:16Z","timestamp":1765008376000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Phoneme-Controlled LLM with Self-Supervised Speech Prompts for Mispronunciation Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7714-5330","authenticated-orcid":false,"given":"Zhengping","family":"Song","sequence":"first","affiliation":[{"name":"Xinjiang University, Urumqi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9475-1101","authenticated-orcid":false,"given":"Zaokere","family":"Kadeer","sequence":"additional","affiliation":[{"name":"Xinjiang University, Urumqi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0973-4505","authenticated-orcid":false,"given":"Mulati","family":"Kahaer","sequence":"additional","affiliation":[{"name":"Xinjiang University, Urumqi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3562-2876","authenticated-orcid":false,"given":"XuDong","family":"Pang","sequence":"additional","affiliation":[{"name":"Xinjiang University, Urumqi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3089-4140","authenticated-orcid":false,"given":"Yinfeng","family":"Yu","sequence":"additional","affiliation":[{"name":"Xinjiang University, Urumqi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1681-1089","authenticated-orcid":false,"given":"Aishan","family":"Wumaier","sequence":"additional","affiliation":[{"name":"Xinjiang University, Urumqi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_1_2_2","first-page":"12449","volume-title":"Advances in Neural Information Processing Systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In Advances in Neural Information Processing Systems , H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.F. Balcan, and H.\u00a0Lin (Eds.), Vol.\u00a033. Curran Associates, Inc., 12449\u201312460. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/92d1e1eb1cd6f9fba3227870bb6d7f07-Paper.pdf"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.4324\/9781315833774"},{"key":"e_1_3_3_1_4_2","unstructured":"Yunfei Chu Jin Xu Qian Yang Haojie Wei Xipin Wei Zhifang Guo Yichong Leng Yuanjun Lv Jinzheng He Junyang Lin Chang Zhou and Jingren Zhou. 2024. Qwen2-Audio Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2407.10759\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2407.10759"},{"key":"e_1_3_3_1_5_2","unstructured":"Danny Driess Fei Xia Mehdi S.\u00a0M. Sajjadi Corey Lynch Aakanksha Chowdhery Brian Ichter Ayzaan Wahid Jonathan Tompson Quan Vuong Tianhe Yu Wenlong Huang Yevgen Chebotar Pierre Sermanet Daniel Duckworth Sergey Levine Vincent Vanhoucke Karol Hausman Marc Toussaint Klaus Greff Andy Zeng Igor Mordatch and Pete Florence. 2023. PaLM-E: An Embodied Multimodal Language Model. (2023)."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Yassir Fathullah Chunyang Wu Egor Lakomkin Junteng Jia Yuan Shangguan Ke Li Jinxi Guo Wenhan Xiong Jay Mahadeokar Ozlem Kalinli Christian Fuegen and Mike Seltzer. 2023. Prompting Large Language Models with Speech Recognition Abilities. (2023).","DOI":"10.1109\/ICASSP48485.2024.10447605"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052975"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"John\u00a0S. Garofolo Lori\u00a0F. Lamel William\u00a0M. Fisher Jonathon\u00a0G. Fiscus and Nancy\u00a0L. Dahlgren. 1993. The DARPA TIMIT Acoustic-Phonetic Continuous Speech Corpus CDROM. Linguistic Data Consortium 1993 (1993).","DOI":"10.6028\/NIST.IR.4930"},{"key":"e_1_3_3_1_9_2","unstructured":"Gemini\u00a0Team Google Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew\u00a0M. Dai Anja Hauth Katie Millican David Silver Slav Petrov Melvin Johnson Ioannis Antonoglou Julian Schrittwieser Amelia Glaese Jilin Chen Emily Pitler Timothy Lillicrap Angeliki Lazaridou Orhan Firat James Molloy Michael Isard Paul\u00a0R. Barham Tom Hennigan Benjamin Lee Fabio Viola Malcolm Reynolds Yuanzhong Xu Ryan Doherty Eli Collins Clemens Meyer Eliza Rutherford Erica Moreira Kareem Ayoub Megha Goel George Tucker Enrique Piqueras Maxim Krikun Iain Barr Nikolay Savinov Ivo Danihelka Becca Roelofs Ana\u00efs White Anders Andreassen Tamara von Glehn Lakshman Yagati Mehran Kazemi Lucas Gonzalez Misha Khalman Jakub Sygnowski Alexandre Frechette Charlotte Smith Laura Culp Lev Proleev Yi Luan Xi Chen James Lottes Nathan Schucher Federico Lebron Alban Rrustemi Natalie Clay Phil Crone Tomas Kocisky Jeffrey Zhao Bartek Perz Dian Yu Heidi Howard Adam Bloniarz and Jack\u00a0W. Rae. 2023. Gemini: A Family of Highly Capable Multimodal Models. (2023)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Bo Guan Jin Cao Bingjie Huang Zhuoyue Wang Xingqi Wang and Zixiang Wang. 2024. Integrated Method of Deep learning and Large Language Model in Speech Recognition. (2024).","DOI":"10.20944\/preprints202407.1520.v2"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Wenping) Hu WP\u00a0(Hu Yao) Qian Y\u00a0(Qian Frank\u00a0K.) Soong FK\u00a0(Soong and Yong) Wang Y\u00a0(Wang. 2015. Improved mispronunciation detection with deep neural network trained acoustic models and transfer learning based logistic regression classifiers.Speech Communication (2015) 154\u2013166.","DOI":"10.1016\/j.specom.2014.12.008"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Hao) Huang H\u00a0(Huang Haihua) Xu HH\u00a0(Xu Ying) Hu Y\u00a0(Hu and Gang) Zhou G\u00a0(Zhou. 2017. A transfer learning approach to goodness of pronunciation based automatic mispronunciation detection. The Journal of the Acoustical Society of America (2017) 3165. Issue 5.","DOI":"10.1121\/1.5011159"},{"key":"e_1_3_3_1_14_2","unstructured":"Rongjie Huang Mingze Li Dongchao Yang Jiatong Shi Xuankai Chang Zhenhui Ye Yuning Wu Zhiqing Hong Jiawei Huang Jinglin Liu Yi Ren Zhou Zhao and Shinji Watanabe. 2023. AudioGPT: Understanding and Generating Speech Music Sound and Talking Head. (2023)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472858"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"W.-K. Leung X. Liu and H. Meng. 2019. CNN-RNN-CTC Based End-to-end Mispronunciation Detection and Diagnosis(Conference Paper). ICASSP IEEE International Conference on Acoustics Speech and Signal Processing - Proceedings (2019) 8132\u20138136.","DOI":"10.1109\/ICASSP.2019.8682654"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746727"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Zhang Long Zhao Ziping Ma Chunmei Shan Linlin Sun Huazhi Jiang Lifen Deng Shiwen and Gao Chang. 2020. End-to-End Automatic Pronunciation Error Detection Based on Improved Hybrid CTC\/Attention Architecture.Sensors (Basel Switzerland) (2020) 1809. Issue 7.","DOI":"10.3390\/s20071809"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","unstructured":"Zhiliang Peng Zonghao Guo Wei Huang Yaowei Wang Lingxi Xie Jianbin Jiao Qi Tian and Qixiang Ye. 2023. Conformer: Local Features Coupling Global Representations for Recognition and Detection. IEEE Transactions on Pattern Analysis and Machine Intelligence 45 8 (2023) 9454\u20139468. 10.1109\/TPAMI.2023.3243048","DOI":"10.1109\/TPAMI.2023.3243048"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2012-238"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Nancy Stenson Bruce Downing Jan Smith and Karin Smith. 1991. The effectiveness of computer-assisted pronunciation training. Calico Journal 9 4 (1991) 5\u201319.","DOI":"10.1558\/cj.v9i4.5-19"},{"key":"e_1_3_3_1_22_2","unstructured":"Yixuan Su Tian Lan Huayang Li Jialu Xu Yan Wang and Deng Cai. 2023. PandaGPT: One Model To Instruction-Follow Them All. (2023)."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Yow-Bang Wang and Lin shan Lee. 2015. Supervised Detection and Unsupervised Discovery of Pronunciation Error Patterns for Computer-Assisted Language Learning. IEEE\/ACM Transactions on Audio Speech and Language Processing (2015) 564\u2013579. Issue 3.","DOI":"10.1109\/TASLP.2014.2387413"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"S.M Witt and S.J Young. 2000. Phone-level pronunciation scoring and assessment for interactive language learning. Speech Communication (2000) 95\u2013108. Issue 2.","DOI":"10.1016\/S0167-6393(99)00044-8"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1467"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO54536.2021.9615987"},{"key":"e_1_3_3_1_27_2","unstructured":"Wenxuan Ye Shaoguang Mao Frank Soong Wenshan Wu Yan Xia Jonathan Tien and Zhiyong Wu. 2021. An Approach to Mispronunciation Detection and Diagnosis with Acoustic Phonetic and Linguistic (APL) Embeddings. (2021)."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1110"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Cuicui Zhu Aishan Wumaier Dongping Wei Zhixing Fan Jianlei Yang Heng Yu Zaokere Kadeer and Liejun Wang. 2024. Pronunciation error detection model based on feature fusion. Speech Communication 156 (2024) 103009.","DOI":"10.1016\/j.specom.2023.103009"}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3771002","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:06:35Z","timestamp":1765008395000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3771002"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":28,"alternative-id":["10.1145\/3743093.3771002","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3771002","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}