{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:31:37Z","timestamp":1776889897246,"version":"3.51.2"},"reference-count":91,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100006435","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2106928"],"award-info":[{"award-number":["2106928"]}],"id":[{"id":"10.13039\/100006435","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Society of Hellman Fellows"},{"name":"Rose Hills Innovator and UC Noyce Initiative Programs at UC Berkeley"},{"name":"BAIR Commons-Meta AI Research"},{"name":"Google Research Scholar Award and Robert E. and Beverly A. Brooks Endowed Chair in EECS"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. Signal Process."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1109\/jstsp.2024.3497655","type":"journal-article","created":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T19:02:47Z","timestamp":1732129367000},"page":"1427-1440","source":"Crossref","is-referenced-by-count":8,"title":["Coding Speech Through Vocal Tract Kinematics"],"prefix":"10.1109","volume":"18","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2596-3164","authenticated-orcid":false,"given":"Cheol Jun","family":"Cho","sequence":"first","affiliation":[{"name":"Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, CA, USA"}]},{"given":"Peter","family":"Wu","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, CA, USA"}]},{"given":"Tejas S.","family":"Prabhune","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1276-4891","authenticated-orcid":false,"given":"Dhruv","family":"Agarwal","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9714-7740","authenticated-orcid":false,"given":"Gopala K.","family":"Anumanchipalli","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, CA, USA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1159\/000261913"},{"key":"ref2","article-title":"The vowel, its nature and structure","author":"Chiba","year":"1958","journal-title":"Phonetic Soc. Jpn."},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1515\/9783110873429"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-94-009-2037-8_6"},{"key":"ref5","volume-title":"Handbook of the International Phonetic Association: A Guide to the Use of the International Phonetic Alphabet","year":"1999"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuron.2018.04.031"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-019-1119-1"},{"key":"ref8","first-page":"5661","article-title":"Neural latent aligner: Cross-trial alignment for learning representations of complex, naturalistic neural data","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Cho","year":"2023"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094711"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447345"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0060603"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2021-18"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10892"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095404"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3372874"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096796"},{"key":"ref18","first-page":"16251","article-title":"Neural analysis and synthesis: Reconstructing speech from self-supervised representations","volume":"34","author":"Choi","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref19","article-title":"Nansy++: Unified voice synthesis with neural analysis and synthesis","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Choi","year":"2023"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref21","article-title":"High fidelity neural audio compression","author":"Dfossez","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref22","article-title":"Naturalspeech 3: Zero-shot speech synthesis with factorized codec and diffusion models","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Ju","year":"2024"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.5334\/labphon.237"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1121\/1.3455847"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5947385"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178812"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO63174.2024.10715399"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095630"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1121\/1.405559"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3168"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/S0016-0032(39)90816-1"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1121\/1.1916020"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1121\/1.1906681"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1121\/1.1907169"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1121\/1.1909541"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1121\/1.1913427"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1121\/1.386780"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(82)90017-6"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-94-009-2037-8_7"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6855097"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2016.06.004"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1604"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2015.02.003"},{"key":"ref44","article-title":"Towards an articulatory-driven neural vocoder for speech synthesis","volume-title":"Proc. ISSP 2020-12th Int. Seminar Speech Prod.","author":"Georges","year":"2020"},{"key":"ref45","first-page":"6309","article-title":"Neural discrete representation learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Van Den Oord","year":"2017"},{"key":"ref46","article-title":"SpeechTokenizer: Unified speech tokenizer for speech large language models","volume-title":"Proc. 25th Int. Conf. Learn. Representations","author":"Zhang","year":"2024"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-475"},{"key":"ref48","first-page":"16624","article-title":"HierSpeech: Bridging the gap between text and speech by hierarchical variational inference using self-supervised representations for speech synthesis","volume":"35","author":"Lee","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref49","article-title":"Hierspeech : Bridging the gap between semantic and acoustic representation of speech by hierarchical variational inference for zero-shot speech synthesis","author":"Lee","year":"2023","journal-title":"arXiv:2311.12454"},{"key":"ref50","first-page":"1","article-title":"Quickvc: Any-to-many voice conversion using inverse short-time fourier transform for faster conversion","volume-title":"Proc. IEEE Autom. Speech Recognit. Understanding Workshop","author":"Guo","year":"2023"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10019"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096149"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2351"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2011-316"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446863"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461329"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446197"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1280"},{"key":"ref60","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"Kong","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1584"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref64","article-title":"CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit","author":"Veaux","year":"2017","journal-title":"CSTR"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(94)90055-8"},{"key":"ref66","article-title":"Mocha: Multichannel articulatory database","author":"Wrench","year":"1999"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1121\/1.4987629"},{"key":"ref68","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2023"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-439"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref71","article-title":"KSS dataset: Korean single speaker speech dataset","author":"Park","year":"2018"},{"key":"ref72","article-title":"JVS corpus: Free Japanese multi-speaker voice corpus","volume-title":"arXiv:1908.06248","author":"Takamichi","year":"2019"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-755"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096626"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095191"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1121\/1.1869752"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1007\/s00422-015-0664-4"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10604"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2010.03.002"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1121\/1.4931827"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1080\/0269920031000071451"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11233"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096401"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446062"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.5555\/1953048.2078195"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6855102"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"}],"container-title":["IEEE Journal of Selected Topics in Signal Processing"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/4200690\/10874827\/10759573-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/4200690\/10874827\/10759573.pdf?arnumber=10759573","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T18:38:01Z","timestamp":1738867081000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10759573\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":91,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2024.3497655","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12]]}}}