{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:08:26Z","timestamp":1775200106740,"version":"3.50.1"},"reference-count":50,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000179","name":"Office of the Director","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000179","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100011039","name":"Intelligence Advanced Research Projects Activity","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100011039","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434737","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Scalable Controllable Accented TTS"],"prefix":"10.1109","author":[{"given":"Henry Li","family":"Xinyuan","sequence":"first","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Zexin","family":"Cai","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Ashi","family":"Garg","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Kevin","family":"Duh","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Leibny Paola","family":"Garc\u00eda-Perera","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Sanjeev","family":"Khudanpur","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Nicholas","family":"Andrews","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Matthew","family":"Wiesner","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]}],"member":"263","reference":[{"key":"ref1","article-title":"CosyVoice: A Scalable Multilingual Zero-Shot Text-to-Speech Synthesizer Based on Supervised Semantic Tokens","author":"Du","year":"2024","journal-title":"arXiv preprint arXiv:2407.05407"},{"key":"ref2","article-title":"MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer","volume-title":"International Conference on Learning Representations","author":"Wang"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"ref4","doi-asserted-by":"crossref","DOI":"10.1109\/ASRU65441.2025.11434735","article-title":"Genvc: Self-supervised zero-shot voice conversion","author":"Cai","year":"2025"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1192"},{"key":"ref6","article-title":"VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers","author":"Chen","year":"2024","journal-title":"arXiv preprint arXiv:2406.05370"},{"key":"ref7","article-title":"NANSY++: Unified voice synthesis with neural analysis and synthesis","volume-title":"The Eleventh International Conference on Learning Representations","author":"Choi"},{"key":"ref8","first-page":"223","article-title":"The cmu arctic speech databases","volume-title":"5th ISCA Workshop on Speech Synthesis (SSW 5)","author":"Kominek"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2018-1110"},{"key":"ref10","article-title":"Superseded - CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit","author":"Veaux","year":"2016"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888195"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888332"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TENCON61640.2024.10902878"},{"key":"ref14","article-title":"Accented text-to-speech synthesis with a conditional variational autoencoder","volume-title":"Proc. of IEEE Tencon"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023072"},{"key":"ref16","article-title":"Dart: Disentanglement of accent and speaker representation in multispeaker text-to-speech","volume-title":"Audio Imagination: NeurIPS 2024 Workshop","author":"Melechovsky"},{"key":"ref17","first-page":"4218","article-title":"Common Voice: A Massively-Multilingual Speech Corpus","volume-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference","author":"Ardila"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2419"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.286"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-419"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2024.3363414"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-40498-6_28"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-318"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2270"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3378110"},{"key":"ref26","doi-asserted-by":"crossref","DOI":"10.1109\/SLT61566.2024.10832155","article-title":"Cross-dialect text-to-speech in pitch-accent language incorporating multi-dialect phoneme-level bert","author":"Yamauchi","year":"2024"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3270079"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2330"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP57327.2022.10037914"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095773"},{"key":"ref31","article-title":"Accentvits:accent transfer for end-to-end tts","author":"Ma","year":"2023"},{"key":"ref32","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proceedings of the 35th International Conference on Machine Learning","volume":"80","author":"Wang"},{"key":"ref33","article-title":"Multiscale accent modeling and disentangling for multi-speaker multiaccent text-to-speech synthesis","author":"Zhou","year":"2025"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2281"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-70"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-329"},{"key":"ref37","article-title":"Discrete variational autoencoders","volume-title":"International Conference on Learning Representations","author":"Rolfe"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref39","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Kong","year":"2020"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1229"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/SPSC.2024-11"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1584"},{"key":"ref44","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2025-2283","article-title":"Pairwise evaluation of accent similarity in speech synthesis","author":"Zhong","year":"2025"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSPW62465.2024.10669905"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref47","article-title":"Speech accent archive","author":"Weinberger"},{"key":"ref48","article-title":"Nist 2022 language recognition evaluation plan","volume-title":"Language Recognition Evaluation, Tech. Rep.","author":"Lee","year":"2022"},{"key":"ref49","article-title":"Good practices for evaluation of synthesized speech","author":"Cooper","year":"2025"},{"key":"ref50","first-page":"28 492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proceedings of the 40th International Conference on Machine Learning, ser. Proceedings of Machine Learning Research","volume":"202","author":"Radford"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434737.pdf?arnumber=11434737","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:59:03Z","timestamp":1775192343000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434737\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434737","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}