{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:52:09Z","timestamp":1740099129761,"version":"3.37.3"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319995786"},{"type":"electronic","value":"9783319995793"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-319-99579-3_69","type":"book-chapter","created":{"date-parts":[[2018,8,24]],"date-time":"2018-08-24T07:36:09Z","timestamp":1535096169000},"page":"676-686","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["First Steps Towards Hybrid Speech Synthesis in Czech TTS System ARTIC"],"prefix":"10.1007","author":[{"given":"Daniel","family":"Tihelka","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zden\u011bk","family":"Hanzl\u00ed\u010dek","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mark\u00e9ta","family":"J\u016fzov\u00e1","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jind\u0159ich","family":"Matou\u0161ek","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2018,8,25]]},"reference":[{"key":"69_CR1","unstructured":"Black, A.W., et al.: CMU blizzard 2007: a hybrid acoustic unit selection system from statistically predicted parameters. In: Blizzard Challenge (2007)"},{"issue":"9\/10","key":"69_CR2","first-page":"341","volume":"5","author":"P Boersma","year":"2001","unstructured":"Boersma, P., van Heuven, V.: Praat, a system for doing phonetics by computer. Glot Int. 5(9\/10), 341\u2013347 (2001)","journal-title":"Glot Int."},{"issue":"4","key":"69_CR3","doi-asserted-by":"publisher","first-page":"317","DOI":"10.1016\/j.specom.2007.01.014","volume":"49","author":"R Clark","year":"2007","unstructured":"Clark, R., Richmond, K., King, S.: Multisyn: open-domain unit selection for the festival speech synthesis system. Speech Commun. 49(4), 317\u2013330 (2007)","journal-title":"Speech Commun."},{"key":"69_CR4","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1007\/978-3-642-15760-8_37","volume-title":"Text, Speech and Dialogue","author":"Z Hanzl\u00ed\u010dek","year":"2010","unstructured":"Hanzl\u00ed\u010dek, Z.: Czech HMM-based speech synthesis. In: Sojka, P., Hor\u00e1k, A., Kope\u010dek, I., Pala, K. (eds.) TSD 2010. LNCS (LNAI), vol. 6231, pp. 291\u2013298. Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-15760-8_37"},{"key":"69_CR5","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"353","DOI":"10.1007\/978-3-319-64206-2_40","volume-title":"Text, Speech, and Dialogue","author":"Z Hanzl\u00ed\u010dek","year":"2017","unstructured":"Hanzl\u00ed\u010dek, Z.: Optimal number of states in HMM-based speech synthesis. In: Ek\u0161tein, K., Matou\u0161ek, V. (eds.) TSD 2017. LNCS (LNAI), vol. 10415, pp. 353\u2013361. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-64206-2_40"},{"key":"69_CR6","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"249","DOI":"10.1007\/978-3-642-40585-3_32","volume-title":"Text, Speech, and Dialogue","author":"Z Hanzl\u00ed\u010dek","year":"2013","unstructured":"Hanzl\u00ed\u010dek, Z., Matou\u0161ek, J., Tihelka, D.: Experiments on reducing footprint of unit selection TTS system. In: Habernal, I., Matou\u0161ek, V. (eds.) TSD 2013. LNCS (LNAI), vol. 8082, pp. 249\u2013256. Springer, Heidelberg (2013). https:\/\/doi.org\/10.1007\/978-3-642-40585-3_32"},{"key":"69_CR7","unstructured":"Hirai, T., Yamagishi, J., Tenpaku, S.: Utilization of an HMM-based feature generation module in 5 ms segment concatenative speech synthesis. In: Proceedings of SSW6, pp. 81\u201384. ISCA, Bonn (2007)"},{"key":"69_CR8","unstructured":"ITU Recommendation BS.1534-2: Method for the subjective assessment of intermediate quality level of coding systems. Technical report, International Telecommunication Union (2014)"},{"key":"69_CR9","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"317","DOI":"10.1007\/978-3-319-64206-2_36","volume-title":"Text, Speech, and Dialogue","author":"M J\u016fzov\u00e1","year":"2017","unstructured":"J\u016fzov\u00e1, M., Tihelka, D., Skarnitzl, R.: Last syllable unit penalization in unit selection TTS. In: Ek\u0161tein, K., Matou\u0161ek, V. (eds.) TSD 2017. LNCS (LNAI), vol. 10415, pp. 317\u2013325. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-64206-2_36"},{"key":"69_CR10","doi-asserted-by":"crossref","unstructured":"J\u016fzov\u00e1, M., Tihelka, D., Vol\u00edn, J.: F0 post-stress rise trends consideration in unit selection TTS. In: TSD 2018. LNCS. Springer (2018, to appear)","DOI":"10.1007\/978-3-030-00794-2_39"},{"key":"69_CR11","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1016\/S0167-6393(98)00085-5","volume":"27","author":"H Kawahara","year":"1999","unstructured":"Kawahara, H., Masuda-Katsuse, I., de Cheveigne, A.: Restructuring speech representations using a pitch-adaptive time-frequency smoothing and an instantaneous-frequency-based F0 extraction: possible role of a repetitive structure in sounds. Speech Commun. 27, 187\u2013207 (1999)","journal-title":"Speech Commun."},{"key":"69_CR12","doi-asserted-by":"publisher","first-page":"552","DOI":"10.1016\/j.specom.2011.01.008","volume":"53","author":"M Leg\u00e1t","year":"2011","unstructured":"Leg\u00e1t, M., Matou\u0161ek, J., Tihelka, D.: On the detection of pitch marks using a robust multi-phase algorithm. Speech Commun. 53, 552\u2013566 (2011)","journal-title":"Speech Commun."},{"key":"69_CR13","doi-asserted-by":"crossref","unstructured":"Ling, Z.H., Wang, R.H.: HMM-based unit selection using frame sized speech segments. In: Proceedings of Interspeech 2006 - ICSLP, pp. 2034\u20132037. ISCA, Pittsburgh (2006)","DOI":"10.21437\/Interspeech.2006-398"},{"key":"69_CR14","doi-asserted-by":"crossref","unstructured":"Ling, Z.H., Wang, R.H.: HMM-based hierarchical unit selection combining Kullback-Leibler divergence with likelihood criterion. In: Proceedings of ICASSP, pp. 1245\u20131248. Honolulu, Hawaii (2007)","DOI":"10.1109\/ICASSP.2007.367302"},{"key":"69_CR15","unstructured":"Matou\u0161ek, J., Leg\u00e1t, M.: Is unit selection aware of audible artifacts? In: Proceedings of SSW8, pp. 267\u2013271. ISCA, Barcelona (2013)"},{"key":"69_CR16","doi-asserted-by":"crossref","unstructured":"Merritt, T., Clark, R.A.J., Wu, Z., Yamagishi, J., King, S.: Deep neural network-guided unit selection synthesis. In: Proceedings of ICASSP, pp. 5145\u20135149. IEEE, Shanghai (2016)","DOI":"10.1109\/ICASSP.2016.7472658"},{"key":"69_CR17","unstructured":"van den Oord, A., et al.: WaveNet: a generative model for raw audio. CoRR abs\/1609.03499 (2016)"},{"key":"69_CR18","doi-asserted-by":"crossref","unstructured":"Pollet, V., Breen, A.: Synthesis by generation and concatenation of multiform segments. In: Proceedings of Interspeech 2008, pp. 1825\u20131828. ISCA, Brisbane (2008)","DOI":"10.21437\/Interspeech.2008-175"},{"issue":"2","key":"69_CR19","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1109\/TASL.2012.2221460","volume":"21","author":"Y Qian","year":"2013","unstructured":"Qian, Y., Soong, F.K., Yan, Z.J.: A unified trajectory tiling approach to high quality speech rendering. IEEE Trans. Audio Speech Lang. Process. 21(2), 280\u2013290 (2013)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"69_CR20","doi-asserted-by":"crossref","unstructured":"Sil\u00e9n, H., Helander, E., Nurminen, J., Koppinen, K., Gabbouj, M.: Using robust Viterbi algorithm and HMM-modeling in unit selection TTS to replace units of poor quality. In: Proceedings of Interspeech 2010, pp. 166\u2013169. ISCA, Makuhari (2010)","DOI":"10.21437\/Interspeech.2010-76"},{"key":"69_CR21","doi-asserted-by":"crossref","unstructured":"Sorin, A., Shechtman, S., Pollet, V.: Refined inter-segment joining in multi-form speech synthesis. In: Proceedings of Interspeech 2014, Singapore, pp. 790\u2013794 (2014)","DOI":"10.21437\/Interspeech.2014-182"},{"key":"69_CR22","doi-asserted-by":"crossref","unstructured":"Taylor, P.: The target cost formulation in unit selection speech synthesis. In: Proceedings of Interspeech 2006 - ICSLP, vol. 1, pp. 2038\u20132041. ISCA, Pittsburgh (2006)","DOI":"10.21437\/Interspeech.2006-399"},{"key":"69_CR23","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511816338","volume-title":"Text-to-Speech Synthesis","author":"P Taylor","year":"2009","unstructured":"Taylor, P.: Text-to-Speech Synthesis, 1st edn. Cambridge University Press, New York (2009)","edition":"1"},{"key":"69_CR24","doi-asserted-by":"crossref","unstructured":"Tihelka, D.: Symbolic prosody driven unit selection for highly natural synthetic speech. In: Proceedings of Interspeech 2005 - Eurospeech, pp. 2525\u20132528. ISCA, Lisbon (2005)","DOI":"10.21437\/Interspeech.2005-786"},{"key":"69_CR25","doi-asserted-by":"crossref","unstructured":"Tihelka, D., Hanzl\u00ed\u010dek, Z., J\u016fzov\u00e1, M., V\u00edt, J., Matou\u0161ek, J., Gr\u016fber, M.: Current state of text-to-speech system ARTIC: a decade of research on the field of speech technologies. In: TSD 2018. LNCS. Springer (2018, to appear)","DOI":"10.1007\/978-3-030-00794-2_40"},{"key":"69_CR26","doi-asserted-by":"crossref","unstructured":"Tihelka, D., Matou\u0161ek, J.: Unit selection and its relation to symbolic prosody: a new approach. In: Proceedings of Interspeech 2006 - ICSLP, vol. 1, pp. 2042\u20132045. ISCA, Pittsburgh (2006)","DOI":"10.21437\/Interspeech.2006-400"},{"key":"69_CR27","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"457","DOI":"10.1007\/978-3-319-10816-2_55","volume-title":"Text, Speech and Dialogue","author":"D Tihelka","year":"2014","unstructured":"Tihelka, D., Matou\u0161ek, J., Hanzl\u00ed\u010dek, Z.: Modelling F$$_0$$0 dynamics in unit selection based speech synthesis. In: Sojka, P., Hor\u00e1k, A., Kope\u010dek, I., Pala, K. (eds.) TSD 2014. LNCS (LNAI), vol. 8655, pp. 457\u2013464. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10816-2_55"},{"key":"69_CR28","unstructured":"Tihelka, D., Stanislav, P.: ARTIC for assistive technologies: transformation to resource-limited hardware. In: Proceedings of WCECS 2011, pp. 581\u2013584. IANG, San Francisco (2011)"},{"issue":"5","key":"69_CR29","doi-asserted-by":"publisher","first-page":"1278","DOI":"10.1109\/TASL.2010.2089679","volume":"19","author":"S Tiomkin","year":"2011","unstructured":"Tiomkin, S., Malah, D., Shechtman, S., Kons, Z.: A hybrid text-to-speech system that combines concatenative and statistical synthesis units. IEEE Trans. Audio Speech Lang. Process. 19(5), 1278\u20131288 (2011)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"69_CR30","doi-asserted-by":"crossref","unstructured":"Toda, T., Tokuda, K.: Speech parameter generation algorithm considering global variance for HMM-based speech synthesis. In: Proceedings of Interspeech 2005, Lisbon, Portugal, pp. 2801\u20132804 (2005)","DOI":"10.21437\/Interspeech.2005-617"},{"key":"69_CR31","doi-asserted-by":"crossref","unstructured":"Yan, Z.J., Qian, Y., Soong, F.K.: Rich-context unit selection (RUS) approach to high quality TTS. In: Proceedings of ICASSP 2010, Dallas, Texas, USA, pp. 4798\u20134801 (2010)","DOI":"10.1109\/ICASSP.2010.5495150"},{"issue":"11","key":"69_CR32","doi-asserted-by":"publisher","first-page":"1039","DOI":"10.1016\/j.specom.2009.04.004","volume":"51","author":"H Zen","year":"2009","unstructured":"Zen, H., Tokuda, K., Black, A.W.: Statistical parametric speech synthesis. Speech Commun. 51(11), 1039\u20131064 (2009)","journal-title":"Speech Commun."}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-99579-3_69","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,30]],"date-time":"2022-08-30T05:03:54Z","timestamp":1661835834000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-99579-3_69"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783319995786","9783319995793"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-99579-3_69","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2018]]}}}