{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:45:53Z","timestamp":1755801953005,"version":"3.44.0"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032025470"},{"type":"electronic","value":"9783032025487"}],"license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02548-7_14","type":"book-chapter","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:39:28Z","timestamp":1755754768000},"page":"158-169","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating Phoneme-Level Pretraining in\u00a0Czech Text-to-Speech Synthesis"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8047-7303","authenticated-orcid":false,"given":"Luk\u00e1\u0161","family":"Vlada\u0159","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7408-7730","authenticated-orcid":false,"given":"Jind\u0159ich","family":"Matou\u0161ek","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3889-8069","authenticated-orcid":false,"given":"Jan","family":"Lehe\u010dka","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6194-7826","authenticated-orcid":false,"given":"Mark\u00e9ta","family":"\u0158ez\u00e1\u010dkov\u00e1","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"14_CR1","unstructured":"Beck, M., et al.: xLSTM: extended long short-term memory. In: International Conference on Neural Information Processing Systems, vol.\u00a037, pp. 107547\u2013107603 (2024)"},{"key":"14_CR2","doi-asserted-by":"publisher","unstructured":"Chen, S., et al.: WavLM: large-scale self-supervised pre-training for full stack speech processing. IEEE J. Sel. Top. Sign. Proces. (2022). https:\/\/doi.org\/10.1109\/JSTSP.2022.3188113","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"14_CR3","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 4171\u20134186 (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"14_CR4","doi-asserted-by":"crossref","unstructured":"Hanzl\u00ed\u010dek, Z., Matou\u0161ek, J., V\u00edt, J.: Using LSTM neural networks for cross-lingual phonetic speech segmentation with an iterative correction procedure. Comput. Intell. 40(2) (2024)","DOI":"10.1111\/coin.12602"},{"key":"14_CR5","doi-asserted-by":"publisher","unstructured":"Hayashi, T., Watanabe, S., Toda, T., Takeda, K., Toshniwal, S., Livescu, K.: Pre-trained text embeddings for enhanced text-to-speech synthesis. In: Interspeech, pp. 4430\u20134434 (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-3177","DOI":"10.21437\/Interspeech.2019-3177"},{"issue":"8","key":"14_CR6","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997). https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Comput."},{"key":"14_CR7","unstructured":"Ito, K., Johnson, L.: The LJ Speech Dataset (2017). https:\/\/keithito.com\/LJ-Speech-Dataset\/"},{"key":"14_CR8","doi-asserted-by":"publisher","unstructured":"Jia, Y., Zen, H., Shen, J., Zhang, Y., Wu, Y.: PnG BERT: augmented BERT on phonemes and graphemes for neural TTS. In: Interspeech, pp. 151\u2013155 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-1757","DOI":"10.21437\/Interspeech.2021-1757"},{"key":"14_CR9","doi-asserted-by":"publisher","unstructured":"Kaneko, T., Tanaka, K., Kameoka, H., Seki, S.: ISTFTNET: fast and lightweight mel-spectrogram vocoder incorporating inverse short-time Fourier transform. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 6207\u20136211 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9746713","DOI":"10.1109\/ICASSP43922.2022.9746713"},{"key":"14_CR10","unstructured":"Kim, J., Kim, S., Kong, J., Yoon, S.: Glow-TTS: a generative flow for text-to-speech via monotonic alignment search. In: International Conference on Neural Information Processing Systems. NIPS 2020 (2020)"},{"key":"14_CR11","unstructured":"Kim, J., Kong, J., Son, J.: Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech. In: Meila, M., Zhang, T. (eds.) International Conference on Machine Learning, vol.\u00a0139, pp. 5530\u20135540 (2021)"},{"issue":"4","key":"14_CR12","doi-asserted-by":"publisher","first-page":"307","DOI":"10.1561\/2200000056","volume":"12","author":"DP Kingma","year":"2019","unstructured":"Kingma, D.P., Welling, M.: An introduction to variational autoencoders. Found. Trends Mach. Learn. 12(4), 307\u2013392 (2019). https:\/\/doi.org\/10.1561\/2200000056","journal-title":"Found. Trends Mach. Learn."},{"key":"14_CR13","unstructured":"Kong, J., Kim, J., Bae, J.: HiFi-GAN: generative adversarial networks for efficient and high fidelity speech synthesis. In: International Conference on Neural Information Processing Systems. NIPS 2020 (2020)"},{"key":"14_CR14","doi-asserted-by":"publisher","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: ALBERT: a lite BERT for self-supervised learning of language representations. In: International Conference on Learning Representations (2020). https:\/\/doi.org\/10.48550\/arXiv.1909.11942","DOI":"10.48550\/arXiv.1909.11942"},{"key":"14_CR15","doi-asserted-by":"publisher","unstructured":"Li, Y.A., Han, C., Jiang, X., Mesgarani, N.: Phoneme-level Bert for enhanced prosody of text-to-speech with grapheme predictions. In: IEEE International Conference on Acoustics, Speech and Signal Processing (2023). https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10097074","DOI":"10.1109\/ICASSP49357.2023.10097074"},{"key":"14_CR16","unstructured":"Li, Y.A., Han, C., Mesgarani, N.: StyleTTS: a style-based generative model for natural and diverse text-to-speech synthesis (2022). http:\/\/arxiv.org\/abs\/2205.15439"},{"key":"14_CR17","unstructured":"Li, Y.A., Han, C., Raghavan, V.S., Mischler, G., Mesgarani, N.: StyleTTS 2: towards human-level text-to-speech through style diffusion and adversarial training with large speech language models. In: International Conference on Neural Information Processing Systems (2023)"},{"key":"14_CR18","doi-asserted-by":"publisher","unstructured":"Li, Y.A., Zare, A., Mesgarani, N.: StarGANv2-VC: a diverse, unsupervised, non-parallel framework for natural-sounding voice conversion. In: Interspeech, pp. 1349\u20131353 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-319","DOI":"10.21437\/Interspeech.2021-319"},{"key":"14_CR19","doi-asserted-by":"publisher","unstructured":"Matou\u0161ek, J., Tihelka, D.: On comparison of phonetic representations for Czech neural speech synthesis. In: Text, Speech and Dialogue, vol. 13502. LNAI, pp. 410\u2013422. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-16270-1_34","DOI":"10.1007\/978-3-031-16270-1_34"},{"key":"14_CR20","doi-asserted-by":"publisher","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision (2022). https:\/\/doi.org\/10.48550\/ARXIV.2212.04356","DOI":"10.48550\/ARXIV.2212.04356"},{"key":"14_CR21","unstructured":"Rezende, D., Mohamed, S.: Variational inference with normalizing flows. In: Bach, F., Blei, D. (eds.) International Conference on Machine Learning, vol.\u00a037, pp. 1530\u20131538 (2015)"},{"key":"14_CR22","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-0827-1","volume-title":"Neural Text-to-Speech Synthesis","author":"X Tan","year":"2023","unstructured":"Tan, X.: Neural Text-to-Speech Synthesis. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-981-99-0827-1"},{"key":"14_CR23","doi-asserted-by":"publisher","unstructured":"The Nguyen, L., Pham, T., Nguyen, D.Q.: XPhoneBERT: a pre-trained multilingual model for phoneme representations for text-to-speech. In: Interspeech, pp. 5506\u20135510 (2023). https:\/\/doi.org\/10.21437\/Interspeech.2023-444","DOI":"10.21437\/Interspeech.2023-444"},{"key":"14_CR24","doi-asserted-by":"publisher","unstructured":"Zhang, G., et al.: Mixed-phoneme BERT: improving BERT with mixed phoneme and sup-phoneme representations for text to speech. In: Interspeech, pp. 456\u2013460 (2022). https:\/\/doi.org\/10.21437\/Interspeech.2022-621","DOI":"10.21437\/Interspeech.2022-621"},{"key":"14_CR25","doi-asserted-by":"publisher","unstructured":"Zhu, J., Zhang, C., Jurgens, D.: ByT5 model for massively multilingual grapheme-to-phoneme conversion. In: Interspeech, pp. 446\u2013450 (2022). https:\/\/doi.org\/10.21437\/Interspeech.2022-538","DOI":"10.21437\/Interspeech.2022-538"},{"key":"14_CR26","doi-asserted-by":"publisher","unstructured":"Zhuang, L., Wayne, L., Ya, S., Jun, Z.: A robustly optimized BERT pre-training approach with post-training. In: Li, S., et al. (eds.) Chinese National Conference on Computational Linguistics, pp. 1218\u20131227 (2021). https:\/\/doi.org\/10.1007\/978-3-030-84186-7_31","DOI":"10.1007\/978-3-030-84186-7_31"},{"key":"14_CR27","doi-asserted-by":"publisher","first-page":"3466","DOI":"10.1109\/TASLP.2024.3426332","volume":"32","author":"M \u0158ez\u00e1\u010dkov\u00e1","year":"2024","unstructured":"\u0158ez\u00e1\u010dkov\u00e1, M., Tihelka, D., Matou\u0161ek, J.: T5G2P: text-to-text transfer transformer based grapheme-to-phoneme conversion. IEEE\/ACM Trans. Audio, Speech, Lang. Process. 32, 3466\u20133476 (2024). https:\/\/doi.org\/10.1109\/TASLP.2024.3426332","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02548-7_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:39:40Z","timestamp":1755754780000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02548-7_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,22]]},"ISBN":["9783032025470","9783032025487"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02548-7_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,8,22]]},"assertion":[{"value":"22 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Erlangen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.kiv.zcu.cz\/tsd2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}