{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T07:53:08Z","timestamp":1776412388861,"version":"3.51.2"},"reference-count":70,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"JSPS KAKENHI","award":["21H05054"],"award-info":[{"award-number":["21H05054"]}]},{"name":"JSPS KAKENHI","award":["22H03639"],"award-info":[{"award-number":["22H03639"]}]},{"name":"JSPS KAKENHI","award":["22J12040"],"award-info":[{"award-number":["22J12040"]}]},{"name":"NSF","award":["ACI-1445606"],"award-info":[{"award-number":["ACI-1445606"]}]},{"name":"Pittsburgh Supercomputing Center"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. 
Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3369537","type":"journal-article","created":{"date-parts":[[2024,2,23]],"date-time":"2024-02-23T19:10:02Z","timestamp":1708715402000},"page":"1829-1844","source":"Crossref","is-referenced-by-count":10,"title":["Text-Inductive Graphone-Based Language Adaptation for Low-Resource Speech Synthesis"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6003-768X","authenticated-orcid":false,"given":"Takaaki","family":"Saeki","sequence":"first","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo-ku, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6940-0115","authenticated-orcid":false,"given":"Soumi","family":"Maiti","sequence":"additional","affiliation":[{"name":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4585-159X","authenticated-orcid":false,"given":"Xinjian","family":"Li","sequence":"additional","affiliation":[{"name":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5970-8631","authenticated-orcid":false,"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0520-7847","authenticated-orcid":false,"given":"Shinnosuke","family":"Takamichi","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo-ku, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0876-5617","authenticated-orcid":false,"given":"Hiroshi","family":"Saruwatari","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo-ku, Tokyo, 
Japan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/2792745.2792775"},{"key":"ref2","article-title":"A survey on neural speech synthesis","author":"Tan","year":"2021"},{"key":"ref3","article-title":"Ethnologue: Languages of the world","author":"Lewis","year":"2016"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10712"},{"key":"ref5","article-title":"Learning pronunciation from a foreign language in speech synthesis networks","author":"Lee","year":"2018"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2730"},{"key":"ref7","article-title":"Multilingual byte2speech models for scalable low-resource speech synthesis","author":"He","year":"2021"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1403"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095702"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1757"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/575"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.421"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.617"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2008.01.002"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2008.4518841"},{"key":"ref17","first-page":"46","article-title":"Hierarchical hybrid language models for open vocabulary continuous speech recognition using WFST","volume-title":"Proc. Annu. Conf. Int. Speech Commun. 
Assoc.","author":"Shaik","year":"2012"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1493"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2012.2187195"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-172"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.472"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683862"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-816"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682674"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-37"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1821"},{"key":"ref27","first-page":"748","article-title":"Bilbowa: Fast bilingual distributed representations without word alignments","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Gouws","year":"2015"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.11640"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1077"},{"key":"ref30","first-page":"7059","article-title":"Cross-lingual language model pretraining","volume-title":"Proc. Int. Adv. Conf. Neural Inf. Process. 
Syst.","author":"Conneau","year":"2019"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.800"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.180"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.351"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2021.3073869"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3177"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414102"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096402"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-621"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref40","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ren","year":"2020"},{"key":"ref41","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2023-4"},{"key":"ref43","article-title":"Glottolog 4.5","author":"Hammarström","year":"2021"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.166"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"ref47","first-page":"4003","article-title":"Ccnet: Extracting high quality monolingual datasets from web crawl data","volume-title":"Proc. Lang. Resour. Eval. 
Conf.","author":"Wenzek","year":"2020"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1500"},{"key":"ref49","article-title":"Language resources and tools","year":"2019"},{"key":"ref50","article-title":"Language resources and tools","year":"2014"},{"key":"ref51","first-page":"741","article-title":"Low-resource multilingual and zero-shot multispeaker TTS","volume-title":"Proc. Amer. Chapter Assoc. Comput. Linguistics","author":"Lux","year":"2022"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00167"},{"key":"ref53","article-title":"The M-AILABS speech dataset","year":"2017"},{"key":"ref54","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. 34th Int. Conf. Neural Inf. Process. Syst.","author":"Kong","year":"2020"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref56","article-title":"CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit","author":"Veaux","year":"2017"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-950"},{"key":"ref58","article-title":"SpeechBrain: A general-purpose speech toolkit","author":"Ravanelli","year":"2021"},{"key":"ref59","article-title":"ESPnet2-TTS: Extending the edge of tts research","author":"Hayashi","year":"2021"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225953"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.3233\/IDA-2007-11508"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"key":"ref65","article-title":"Robust speech recognition via large-scale weak 
supervision","author":"Radford","year":"2022"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023141"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1017\/S1930297500002205"},{"key":"ref68","article-title":"Lancers"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.21437\/vcc_bc.2020-14"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3148148"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/6570655\/10304349\/10444075-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/10304349\/10444075.pdf?arnumber=10444075","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,26]],"date-time":"2024-03-26T12:55:05Z","timestamp":1711457705000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10444075\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":70,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3369537","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}