{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,17]],"date-time":"2026-06-17T04:29:01Z","timestamp":1781670541006,"version":"3.54.5"},"reference-count":55,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100017582","name":"Beijing National Research Center For Information Science And Technology","doi-asserted-by":"publisher","award":["BNR2021KF02005"],"award-info":[{"award-number":["BNR2021KF02005"]}],"id":[{"id":"10.13039\/501100017582","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2022]]},"DOI":"10.1109\/access.2022.3212417","type":"journal-article","created":{"date-parts":[[2022,10,5]],"date-time":"2022-10-05T15:33:20Z","timestamp":1664984000000},"page":"106451-106462","source":"Crossref","is-referenced-by-count":11,"title":["Self-Supervised Pre-Trained Speech Representation Based End-to-End Mispronunciation Detection and Diagnosis of Mandarin"],"prefix":"10.1109","volume":"10","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2085-0163","authenticated-orcid":false,"given":"Yunfei","family":"Shen","sequence":"first","affiliation":[{"name":"School of Software, Xinjiang University, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qingqing","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multilingual Information Technology in Xinjiang Uyghur Autonomous Region, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhixing","family":"Fan","sequence":"additional","affiliation":[{"name":"School of Software, Xinjiang University, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiajun","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Software, Xinjiang University, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1681-1089","authenticated-orcid":false,"given":"Aishan","family":"Wumaier","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multilingual Information Technology in Xinjiang Uyghur Autonomous Region, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"The Putonghua Training and Testing Center of National Language and Writing Committee","author":"Jia","year":"2004"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(99)00044-8"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2014.12.008"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1121\/1.5011159"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2012.6424254"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639269"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2008-471"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2012-238"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/0022-2836(70)90057-4"},{"key":"ref10","volume-title":"A study of key technologies to freely spoken Mandarin speech evaluation","author":"Xu","year":"2016"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2018-1110"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref13","article-title":"Deep speech: Scaling up end-to-end speech recognition","author":"Hannun","year":"2014","journal-title":"arXiv:1412.5567"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2763455"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383623"},{"key":"ref16","article-title":"Sequence transduction with recurrent neural networks","author":"Graves","year":"2012","journal-title":"arXiv:1211.3711"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref18","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"arXiv:1706.03762"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053896"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.3390\/s20071809"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682654"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052975"},{"key":"ref26","article-title":"A full text-dependent end to end mispronunciation detection and diagnosis with easy data augmentation techniques","author":"Fu","year":"2021","journal-title":"arXiv:2104.08428"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2021.04.004"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413953"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2006.1660023"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-647"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO54536.2021.9615987"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1446"},{"key":"ref33","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"arXiv:2006.11477"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1344"},{"key":"ref35","article-title":"Vq-wav2vec: Self-supervised learning of discrete speech representations","author":"Baevski","year":"2019","journal-title":"arXiv:1910.05453"},{"key":"ref36","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639655"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2021.3090866"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1473"},{"key":"ref41","article-title":"Improving transformer-based speech recognition using unsupervised pre-training","author":"Jiang","year":"2019","journal-title":"arXiv:1910.09932"},{"key":"ref42","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv:1807.03748"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref45","article-title":"WavLM: Large-scale self-supervised pre-training for full stack speech processing","author":"Chen","year":"2021","journal-title":"arXiv:2110.13900"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413888"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-148"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1259"},{"issue":"9","key":"ref49","first-page":"341","article-title":"Praat, a system for doing phonetics by computer","volume":"5","author":"Boersma","year":"2002","journal-title":"Glot Int."},{"key":"ref50","article-title":"THCHS-30: A free Chinese speech corpus","author":"Wang","year":"2015","journal-title":"arXiv:1512.01882"},{"key":"ref51","article-title":"SpeechBrain: A general-purpose speech toolkit","author":"Ravanelli","year":"2021","journal-title":"arXiv:2106.04624"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-143"},{"key":"ref53","article-title":"The Kaldi speech recognition toolkit","volume-title":"Proc. IEEE Workshop Autom. Speech Recognit. Understand.","author":"Povey"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/9668973\/09912419.pdf?arnumber=9912419","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,25]],"date-time":"2025-11-25T18:29:43Z","timestamp":1764095383000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9912419\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"references-count":55,"URL":"https:\/\/doi.org\/10.1109\/access.2022.3212417","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]}}}