{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,20]],"date-time":"2025-11-20T05:57:11Z","timestamp":1763618231368,"version":"3.45.0"},"publisher-location":"Cham","reference-count":26,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031705656"},{"type":"electronic","value":"9783031705663"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70566-3_5","type":"book-chapter","created":{"date-parts":[[2024,8,31]],"date-time":"2024-08-31T18:29:51Z","timestamp":1725128991000},"page":"46-57","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Zero-Shot vs. Few-Shot Multi-speaker TTS Using Pre-trained Czech SpeechT5 Model"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3889-8069","authenticated-orcid":false,"given":"Jan","family":"Lehe\u010dka","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4001-9289","authenticated-orcid":false,"given":"Zden\u011bk","family":"Hanzl\u00ed\u010dek","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7408-7730","authenticated-orcid":false,"given":"Jind\u0159ich","family":"Matou\u0161ek","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3149-2330","authenticated-orcid":false,"given":"Daniel","family":"Tihelka","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,27]]},"reference":[{"unstructured":"Method for the subjective assessment of intermediate quality level of coding systems. ITU Recommendation ITU-R BS.1534-2. Tech. rep., International Telecomminication Union (2014)","key":"5_CR1"},{"doi-asserted-by":"crossref","unstructured":"Ao, J., et al.: SpeechT5: Unified-modal encoder-decoder pre-training for spoken language processing. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 5723\u20135738. Association for Computational Linguistics, Dublin, Ireland (2022)","key":"5_CR2","DOI":"10.18653\/v1\/2022.acl-long.393"},{"unstructured":"Ardila, R., et al.: Common voice: a massively-multilingual speech corpus. In: Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), pp. 4211\u20134215 (2020)","key":"5_CR3"},{"unstructured":"Babu, A., et al.: XLS-R: Self-supervised cross-lingual speech representation learning at scale (2021). arXiv abs\/2111.09296","key":"5_CR4"},{"unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: A framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)","key":"5_CR5"},{"unstructured":"Betker, J.: TorToiSe text-to-speech (2022). https:\/\/github.com\/neonbjb\/tortoise-tts","key":"5_CR6"},{"unstructured":"Casanova, E., Weber, J., Shulby, C.D., Junior, A.C., G\u00f6lge, E., Ponti, M.A.: YourTTS: towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone. In: International Conference on Machine Learning, pp. 2709\u20132720. PMLR (2022)","key":"5_CR7"},{"doi-asserted-by":"publisher","unstructured":"Cooper, E., Huang, W.C., Tsao, Y., Wang, H.M., Toda, T., Yamagishi, J.: The VoiceMOS challenge 2023: Zero-shot subjective speech quality prediction for multiple domains. In: 2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU) (2023). https:\/\/doi.org\/10.1109\/ASRU57964.2023.10389763","key":"5_CR8","DOI":"10.1109\/ASRU57964.2023.10389763"},{"unstructured":"Gr\u016fber, M., Ch\u00fdlek, A., Matou\u0161ek, J.: Framework for conducting tasks requiring human assessment. In: Proceedings of Interspeech 2019, pp. 4626\u20134627 (2019)","key":"5_CR9"},{"unstructured":"Jiang, Z., et al.: Mega-TTS 2: Boosting prompting mechanisms for zero-shot speech synthesis (2024)","key":"5_CR10"},{"unstructured":"Ju, Z., et al.: NaturalSpeech 3: Zero-shot speech synthesis with factorized codec and diffusion models (2024)","key":"5_CR11"},{"doi-asserted-by":"publisher","unstructured":"Kim, C., Stern, R.M.: Robust signal-to-noise ratio estimation based on waveform amplitude distribution analysis. In: Proceedings of Interspeech 2008, pp. 2598\u20132601 (2008) https:\/\/doi.org\/10.21437\/Interspeech.2008-644","key":"5_CR12","DOI":"10.21437\/Interspeech.2008-644"},{"unstructured":"Le, M., et al.: Voicebox: Text-guided multilingual universal speech generation at scale (2023)","key":"5_CR13"},{"unstructured":"Lee, S.H., Choi, H.Y., Kim, S.B., Lee, S.W.: HierSpeech++: bridging the gap between semantic and acoustic representation of speech by hierarchical variational inference for zero-shot speech synthesis (2023)","key":"5_CR14"},{"doi-asserted-by":"publisher","unstructured":"Lehe\u010dka, J., \u0160vec, J., Pra\u017e\u00e1k, A., Psutka, J.V.: Exploring capabilities of monolingual audio transformers using large datasets in automatic speech recognition of Czech. In: Proceedings of Interspeech 2022, pp. 1831\u20131835 (2022) https:\/\/doi.org\/10.21437\/Interspeech.2022-10439","key":"5_CR15","DOI":"10.21437\/Interspeech.2022-10439"},{"unstructured":"Li, Y.A., Han, C., Raghavan, V., Mischler, G., Mesgarani, N.: StyleTTS 2: towards human-level text-to-speech through style diffusion and adversarial training with large speech language models. In: Oh, A., Neumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems. vol.\u00a036, pp. 19594\u201319621. Curran Associates, Inc. (2023)","key":"5_CR16"},{"issue":"140","key":"5_CR17","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020)","journal-title":"J. Mach. Learn. Res."},{"unstructured":"Shen, K., et al.: NaturalSpeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers (2023)","key":"5_CR18"},{"doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., McCree, A., Sell, G., Povey, D., Khudanpur, S.: Spoken language recognition using x-vectors. In: Odyssey. vol.\u00a02018, pp. 105\u2013111 (2018)","key":"5_CR19","DOI":"10.21437\/Odyssey.2018-15"},{"key":"5_CR20","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1007\/978-3-030-83527-9_7","volume-title":"Text, Speech, and Dialogue","author":"J \u0160vec","year":"2021","unstructured":"\u0160vec, J., Lehe\u010dka, J., \u0160m\u00eddl, L., Ircing, P.: Transformer-based automatic punctuation prediction and word casing reconstruction of the ASR output. In: Ek\u0161tein, K., P\u00e1rtl, F., Konop\u00edk, M. (eds.) TSD 2021. LNCS (LNAI), vol. 12848, pp. 86\u201394. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-83527-9_7"},{"unstructured":"Tihelka, D., \u0158ez\u00e1\u010dkov\u00e1, M., Gr\u016fber, M., Hanzl\u00ed\u010dek, Z., V\u00edt, J., Matou\u0161ek, J.: Save your voice: Voice banking and TTS for anyone. In: Proceedings of Interspeech 2021, pp. 4855\u20134856 (2021)","key":"5_CR21"},{"unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., Luxburg, U.V., Bengio, S., Wallach, H., Fergus, R., Vishwanathan, S., Garnett, R. (eds.) Advances in Neural Information Processing Systems. vol.\u00a030. Curran Associates, Inc. (2017)","key":"5_CR22"},{"doi-asserted-by":"crossref","unstructured":"Wang, C., et al.: VoxPopuli: a large-scale multilingual speech corpus for representation learning, semi-supervised learning and interpretation. In: ACL 2021-59th Annual Meeting of the Association for Computational Linguistics (2021)","key":"5_CR23","DOI":"10.18653\/v1\/2021.acl-long.80"},{"unstructured":"Wang, C., et al.: Neural codec language models are zero-shot text to speech synthesizers (2023)","key":"5_CR24"},{"unstructured":"Yang, D., et al.: UniAudio: An audio foundation model toward universal audio generation (2023)","key":"5_CR25"},{"doi-asserted-by":"crossref","unstructured":"Zen, H., et al.: Libritts: a corpus derived from libriSpeech for text-to-speech. In: Proceedings of Interspeech 2019 (2019)","key":"5_CR26","DOI":"10.21437\/Interspeech.2019-2441"}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70566-3_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,20]],"date-time":"2025-11-20T05:15:02Z","timestamp":1763615702000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70566-3_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031705656","9783031705663"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70566-3_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"27 August 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Brno","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Czech Republic","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.tsdconference.org\/tsd2024\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}