{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T06:19:40Z","timestamp":1774419580810,"version":"3.50.1"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icassp49660.2025.10889764","type":"proceedings-article","created":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T13:52:43Z","timestamp":1741787563000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["Toward Visual Pronunciation Learning: A Speech-to-Articulatory Animation Pipeline Leveraging wav2vec 2.0 and rtMRI Landmarks"],"prefix":"10.1109","author":[{"given":"Mushaffa Rasyid","family":"Ridha","sequence":"first","affiliation":[{"name":"Japan Adv. Inst. of Science &amp; Tech.,Grad. School of Adv. Science &amp; Tech.,Nomi,Japan"}]},{"given":"Shinobu","family":"Hasegawa","sequence":"additional","affiliation":[{"name":"Japan Adv. Inst. of Science &amp; Tech.,Center of IDER,Nomi,Japan"}]},{"given":"Sakriani","family":"Sakti","sequence":"additional","affiliation":[{"name":"Inst. of Science &amp; Tech.,Grad. School of Science &amp; Tech. Nara,Ikoma,Japan"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Computer Assisted Pronunciation Training (CAPT): A Systematic Review of Studies from 2012 to 2022","year":"2022"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-337"},{"key":"ref3","first-page":"1","article-title":"Pronunciation error detection using DNN articulatory model based on multi-lingual and multi-task learning","volume-title":"Proc. IEEE International Symposium on Chinese Spoken Language Processing (ISCSLP)","author":"Duan"},{"key":"ref4","volume-title":"The sounds of the world\u2019s languages","author":"Ladefoged","year":"1996"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1121\/1.2029064"},{"issue":"3","key":"ref6","first-page":"1666","article-title":"Anatomic development of the oral and pharyngeal portions of the vocal tract: An imaging study","volume-title":"The Journal of the Acoustical Society of America","volume":"125","author":"Vorperian","year":"2009"},{"issue":"6","key":"ref7","first-page":"3078","article-title":"Electromagnetic midsagittal articulometer systems for transducing speech articulatory movements","volume-title":"The Journal of the Acoustical Society of America","volume":"92","author":"Perkell","year":"1992"},{"key":"ref8","first-page":"235","article-title":"The DKU-JNU-EMA electromagnetic articulography database on mandarin and chinese dialects with tandem feature based acoustic-to-articulatory inversion","volume-title":"Proc. IEEE International Symposium on Chinese Spoken Language Processing (ISCSLP)","author":"Cai"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1121\/1.4890284"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/SLaTE.2023-14"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01976"},{"key":"ref12","first-page":"13796","article-title":"Refining rtmri landmark-based vocal tract contour labels with fcn-based smoothing and point-to-curve projection","volume-title":"Proc. of the Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)","author":"Ridha"},{"key":"ref13","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Baevski","year":"2020"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1121\/1.1909864"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1202"},{"key":"ref16","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2024-2191","article-title":"Towards a quantitative analysis of coarticulation with a phoneme-to-articulatory model","author":"Fan","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-184"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448411"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1250\/ast.36.428"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178893"},{"key":"ref21","article-title":"Speaker dependent acoustic-to-articulatory inversion using real-time mri of the vocal tract","author":"Csap\u00f3","year":"2020"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2024-1517"},{"key":"ref23","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2019-1873","article-title":"wav2vec: Unsupervised pre-training for speech recognition","author":"Schneider","year":"2019"},{"key":"ref24","doi-asserted-by":"crossref","DOI":"10.1109\/TASLP.2021.3122291","article-title":"Hubert: Self-supervised speech representation learning by masked prediction of hidden units","author":"Hsu","year":"2021"},{"key":"ref25","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2021-1280","article-title":"Exploring wav2vec 2.0 on speaker verification and language identification","author":"Fan","year":"2021"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053569"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-777"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2023.3317236"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10245"},{"key":"ref31","doi-asserted-by":"crossref","DOI":"10.23919\/APSIPAASC55919.2022.9979979","article-title":"3m: An effective multi-view, multi-granularity, and multi-aspect modeling approach to english pronunciation assessment","author":"Chao","year":"2022"},{"key":"ref32","article-title":"Effectiveness of self-supervised pre-training for speech recognition","author":"Baevski","year":"2020"},{"key":"ref33","article-title":"Usc-timit: A database of multimodal speech production data","author":"Narayanan","year":"2014"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2000-772"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2008.928920"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1207\/s15326969eco0104_2"},{"key":"ref37","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"Paszke","year":"2019"},{"key":"ref38","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2017"}],"event":{"name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Hyderabad, India","start":{"date-parts":[[2025,4,6]]},"end":{"date-parts":[[2025,4,11]]}},"container-title":["ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10887540\/10887541\/10889764.pdf?arnumber=10889764","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T05:23:37Z","timestamp":1774416217000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10889764\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/icassp49660.2025.10889764","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}