{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T16:58:59Z","timestamp":1781283539467,"version":"3.54.1"},"reference-count":70,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2010,7,1]],"date-time":"2010-07-01T00:00:00Z","timestamp":1277942400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2010,7]]},"DOI":"10.1109\/tasl.2010.2045237","type":"journal-article","created":{"date-parts":[[2010,3,16]],"date-time":"2010-03-16T19:01:37Z","timestamp":1268766097000},"page":"984-1004","source":"Crossref","is-referenced-by-count":50,"title":["Thousands of Voices for HMM-Based Speech Synthesis\u2013Analysis and Application of TTS Systems Built on Various ASR Corpora"],"prefix":"10.1109","volume":"18","author":[{"given":"Junichi","family":"Yamagishi","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bela","family":"Usabaev","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Simon","family":"King","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Oliver","family":"Watts","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"John","family":"Dines","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jilei","family":"Tian","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yong","family":"Guan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rile","family":"Hu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Keiichiro","family":"Oura","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yi-Jian","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Keiichi","family":"Tokuda","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Reima","family":"Karhila","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mikko","family":"Kurimo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref70","article-title":"revisiting the security of speaker verification systems against imposture using synthetic speech","author":"de leon","year":"2010","journal-title":"Proc ICASSP96"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2008.2006647"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1006\/csla.1998.0043"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1093\/ietisy\/e90-d.5.825"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1093\/ietisy\/e90-1.1.325"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(98)00085-5"},{"key":"ref30","doi-asserted-by":"crossref","DOI":"10.21437\/Blizzard.2008-1","article-title":"the blizzard challenge 2008","author":"karaiskos","year":"2008","journal-title":"Proc Blizzard Challenge 2008"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICSLP.1996.607807"},{"key":"ref36","first-page":"187","article-title":"evaluation of flat start labeling for phoneme based mandarin hts system","author":"guan","year":"2009","journal-title":"Proc ORIENTAL-COCOSDA-09"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1250\/ast.21.79"},{"key":"ref34","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1111\/j.2517-6161.1977.tb01600.x","article-title":"maximum likelihood from incomplete data via the em algorithm","volume":"39","author":"dempster","year":"1977","journal-title":"J R Statist Soc Series B"},{"key":"ref60","first-page":"43","article-title":"reformulating the hmm as a trajectory model","volume":"104","author":"tokuda","year":"2004","journal-title":"IEICE Tech Rep Natural Lang Understanding Models of Commun"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-9280.1990.tb00079.x"},{"key":"ref61","author":"erro","year":"2008","journal-title":"Intra-lingual and cross-lingual voice conversion using harmonic plus stochastic models"},{"key":"ref63","article-title":"analysis of unsupervised and noise-robust speaker-adaptive hmm-based speech synthesis systems toward a unified asr and tts framework","author":"yamagishi","year":"2009","journal-title":"Proc Blizzard Challenge Workshop"},{"key":"ref28","doi-asserted-by":"crossref","DOI":"10.21437\/Blizzard.2008-7","article-title":"the hts-2008 system: yet another evaluation of the speaker-adaptive hmm-based speech synthesis system in the 2008 blizzard challenge","author":"yamagishi","year":"2008","journal-title":"Proc Blizzard Challenge 2008"},{"key":"ref64","first-page":"1043","article-title":"Mel-generalized cepstral analysis&#x2014;A unified approach to speech spectral estimation","author":"tokuda","year":"1994","journal-title":"Proc ICSLP-94"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.3115\/1075812.1075885"},{"key":"ref65","doi-asserted-by":"crossref","first-page":"15","DOI":"10.1109\/ASRU.2001.1034578","article-title":"adaptive training for robust asr","author":"gales","year":"2001","journal-title":"Proc IEEE Workshop Autom Speech Recognition Understanding"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(01)00059-0"},{"key":"ref29","article-title":"the blizzard challenge 2007","author":"fraser","year":"2007","journal-title":"Proc BLZ3-2007 (in Proc SSW6)"},{"key":"ref67","author":"creer","year":"2010","journal-title":"Computer Synthesized Speech Technologies Tools for Aiding Impairment"},{"key":"ref68","doi-asserted-by":"crossref","first-page":"1223","DOI":"10.21437\/Eurospeech.1999-286","article-title":"on the security of hmm-based speaker verification systems against imposture using synthetic speech","author":"masuko","year":"1999","journal-title":"Proc EuroSpeech-99"},{"key":"ref69","first-page":"302","article-title":"imposture using synthetic speech against speaker verification based on spectrum and pitch","author":"masuko","year":"2000","journal-title":"Proc ICSLP-00"},{"key":"ref2","first-page":"2374","article-title":"simultaneous modeling of spectrum, pitch and duration in hmm-based speech synthesis","author":"yoshimura","year":"1999","journal-title":"Proc EuroSpeech-99"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2045237"},{"key":"ref20","year":"2008","journal-title":"Deliverable Report D2 1"},{"key":"ref22","first-page":"179","article-title":"ximera: a new tts from atr based on corpus-based technologies","author":"kawai","year":"2004","journal-title":"Proc ISCA 5th Speech Synth Workshop"},{"key":"ref21","article-title":"an efficient and unified approach of mandarin hts system","author":"guan","year":"2010","journal-title":"Proc ICASSP96"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.860774"},{"key":"ref23","first-page":"2688","article-title":"ximera: a concatenative speech synthesis system with large scale corpora","volume":"j89 d ii","author":"kawai","year":"2006","journal-title":"IEICE Trans Inf Syst"},{"key":"ref26","author":"tokuda","year":"2004","journal-title":"Text to Speech Synthesis New Paradigms and Advances"},{"key":"ref25","first-page":"115","article-title":"generacion de una voz sintetica en castellano basada en hsmm para la evaluacion albayzin 2008: conversion texto a voz","author":"barra-chicote","year":"2008","journal-title":"V Jornadas en Tecnologia del Habla"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1976.1162849"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1980.1171078"},{"key":"ref59","author":"tsakalidis","year":"2005","journal-title":"Cross-corpus normalization of diverse acoustic training data for robust HMM training"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2005.1415150"},{"key":"ref57","author":"cox","year":"2001","journal-title":"Multidimensional Scaling"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.13176\/11.112"},{"key":"ref55","first-page":"717","article-title":"analysis of speaking styles by two-dimensional visualization of aggregate of acoustic models","author":"shozakai","year":"2004","journal-title":"Proc ICSLP-04"},{"key":"ref54","article-title":"statistical analysis of the blizzard challenge 2007 listening test results","author":"clark","year":"2007","journal-title":"Proc BLZ3-2007 (in Proc SSW6)"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960451"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1996.541114"},{"key":"ref10","first-page":"298","article-title":"darpa resource management bench","author":"pallet","year":"1990","journal-title":"Proc Workshop Speech Natural Lang"},{"key":"ref11","first-page":"345","article-title":"globalphone: a multilingual speech and text database developed at karlsruhe university","author":"schultz","year":"2002","journal-title":"Proc ICSLP'02"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1093\/ietisy\/e90-d.5.816"},{"key":"ref12","first-page":"329","article-title":"SPEECON&#x2014;speech databases for consumer devices: Database specification and validation","author":"iskra","year":"2002","journal-title":"Proc LREC'02"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1561\/2000000004"},{"key":"ref14","first-page":"3261","article-title":"the design of the newspaper-based japanese large vocabulary continuous speech recognition corpus","author":"itou","year":"1998","journal-title":"Proc ICSLP-98"},{"key":"ref15","first-page":"31","author":"kubozono","year":"1995","journal-title":"The handbook of Japanese Linguistics"},{"key":"ref16","doi-asserted-by":"crossref","first-page":"823","DOI":"10.21437\/Eurospeech.1999-200","article-title":"synthesis of regional english using a keyword lexicon","volume":"2","author":"fitt","year":"1999","journal-title":"Proc EuroSpeech-99"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2007.01.014"},{"key":"ref18","year":"0"},{"key":"ref19","first-page":"223","article-title":"an hmm-based mandarin chinese text-to-speech system","author":"qian","year":"2006","journal-title":"Proc ISCSLP'06"},{"key":"ref4","article-title":"the hmm-based speech synthesis system (hts) version 2.0","author":"zen","year":"2007","journal-title":"Proc 6th ISCA Workshop Speech Synth (SSW-6)"},{"key":"ref3","author":"tokuda","year":"0","journal-title":"The HMM-Based Speech Synthesis System (HTS) Version 2 1"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2016394"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2016394"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2009.04.004"},{"key":"ref49","doi-asserted-by":"crossref","first-page":"528","DOI":"10.21437\/Interspeech.2009-192","article-title":"state mapping based method for cross-lingual speaker adaptation in hmm-based speech synthesis","author":"wu","year":"2009","journal-title":"Proc Interspeech-09"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1995.479278"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2010.2079315"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2010.2079315"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CHINSL.2008.ECP.14"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.3115\/1075812.1075824"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225953"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(90)90021-Z"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1093\/ietisy\/e90-d.2.533"},{"key":"ref43","article-title":"the blizzard challenge 2009","author":"king","year":"2009","journal-title":"Proc Blizzard Challenge Workshop"}],"container-title":["IEEE Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx5\/10376\/5485195\/05431023.pdf?arnumber=5431023","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,19]],"date-time":"2025-02-19T13:59:01Z","timestamp":1739973541000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/5431023\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2010,7]]},"references-count":70,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tasl.2010.2045237","relation":{},"ISSN":["1558-7916","1558-7924"],"issn-type":[{"value":"1558-7916","type":"print"},{"value":"1558-7924","type":"electronic"}],"subject":[],"published":{"date-parts":[[2010,7]]}}}