{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:22:16Z","timestamp":1760314936378,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":22,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032079589","type":"print"},{"value":"9783032079596","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07959-6_4","type":"book-chapter","created":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:04Z","timestamp":1760260924000},"page":"45-59","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing Speech Recognition Through Text-to-Speech and\u00a0Voice Conversion Augmentation"],"prefix":"10.1007","author":[{"given":"Yunus Emre","family":"Ozkose","sequence":"first","affiliation":[]},{"given":"Ali","family":"Haznedaroglu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,13]]},"reference":[{"key":"4_CR1","unstructured":"Ardila, R., et al.: Common voice: a massively-multilingual speech corpus (2020). https:\/\/arxiv.org\/abs\/1912.06670"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Baas, M., Kamper, H.: Voice conversion can improve asr in very low-resource settings. In: Interspeech (2021). https:\/\/api.semanticscholar.org\/CorpusID:242757430","DOI":"10.21437\/Interspeech.2022-112"},{"key":"4_CR3","doi-asserted-by":"crossref","unstructured":"Baas, M., van Niekerk, B., Kamper, H.: Voice conversion with just nearest neighbors (2023). https:\/\/arxiv.org\/abs\/2305.18975","DOI":"10.21437\/Interspeech.2023-419"},{"key":"4_CR4","doi-asserted-by":"crossref","unstructured":"Casanova, Eet al.: Asr data augmentation in low-resource settings using cross-lingual multi-speaker tts and cross-lingual voice conversion (2023). https:\/\/arxiv.org\/abs\/2204.00618","DOI":"10.21437\/Interspeech.2023-496"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Gulati, A., et al.: Conformer: convolution-augmented transformer for speech recognition (2020). https:\/\/arxiv.org\/abs\/2005.08100","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"4_CR6","unstructured":"Guo, Y., et al.: vec2wav 2.0: advancing voice conversion via discrete token vocoders (2024). https:\/\/arxiv.org\/abs\/2409.01995"},{"key":"4_CR7","unstructured":"Jia, Y., et\u00a0al.: Transfer learning from speaker verification to multispeaker text-to-speech synthesis. Adv. Neural Inf. Process. Syst. 31 (2018)"},{"key":"4_CR8","doi-asserted-by":"publisher","first-page":"1703","DOI":"10.1162\/tacl_a_00618","volume":"11","author":"E Kharitonov","year":"2023","unstructured":"Kharitonov, E., et al.: Speak, read and prompt: high-fidelity text-to-speech with minimal supervision. Trans. Assoc. Comput. Linguist. 11, 1703\u20131718 (2023)","journal-title":"Trans. Assoc. Comput. 
Linguist."},{"key":"4_CR9","unstructured":"Kim, J., Kong, J., Son, J.: Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech. In: International Conference on Machine Learning, pp. 5530\u20135540. PMLR (2021)"},{"key":"4_CR10","first-page":"17022","volume":"33","author":"J Kong","year":"2020","unstructured":"Kong, J., Kim, J., Bae, J.: Hifi-gan: generative adversarial networks for efficient and high fidelity speech synthesis. Adv. Neural. Inf. Process. Syst. 33, 17022\u201317033 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4_CR11","unstructured":"Li, J., Gadde, R., Ginsburg, B., Lavrukhin, V.: Training neural speech recognition systems with synthetic speech augmentation. arXiv preprint arXiv:1811.00707 (2018)"},{"key":"4_CR12","unstructured":"Liu, S.: Zero-shot voice conversion with diffusion transformers (2024). https:\/\/arxiv.org\/abs\/2411.09943"},{"key":"4_CR13","unstructured":"Microsoft: Azure text-to-speech (2023). https:\/\/azure.microsoft.com\/services\/cognitive-services\/text-to-speech\/. Accessed 13 May 2025"},{"key":"4_CR14","doi-asserted-by":"publisher","unstructured":"Mittag, G., Naderi, B., Chehadi, A., M\u00f6ller, S.: Nisqa: a deep cnn-self-attention model for multidimensional speech quality prediction with crowdsourced datasets. In: Interspeech. ISCA (2021). https:\/\/doi.org\/10.21437\/interspeech.2021-299. http:\/\/dx.doi.org\/10.21437\/Interspeech.2021-299","DOI":"10.21437\/interspeech.2021-299"},{"key":"4_CR15","unstructured":"Ogun, S., Colotte, V., Vincent, E.: An exhaustive evaluation of tts- and vc-based data augmentation for asr (2025). https:\/\/arxiv.org\/abs\/2503.08954"},{"key":"4_CR16","unstructured":"Oord, A.V.D., et al.: Wavenet: a generative model for raw audio. arXiv preprint arXiv:1609.03499 (2016)"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an asr corpus based on public domain audio books. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"4_CR18","unstructured":"Qian, K., Zhang, Y., Chang, S., Yang, X., Hasegawa-Johnson, M.: Autovc: zero-shot voice style transfer with only autoencoder loss. In: International Conference on Machine Learning, pp. 5210\u20135219. PMLR (2019)"},{"key":"4_CR19","unstructured":"Ren, Y., et al.: Fastspeech 2: fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558 (2020)"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Shen, J., et\u00a0al.: Natural tts synthesis by conditioning wavenet on mel spectrogram predictions. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4779\u20134783. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"4_CR21","unstructured":"Wang, C., et\u00a0al.: Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111 (2023)"},{"key":"4_CR22","unstructured":"Yao, Z., et al.: Zipformer: a faster and better encoder for automatic speech recognition (2024). 
https:\/\/arxiv.org\/abs\/2310.11230"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07959-6_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:13Z","timestamp":1760260933000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07959-6_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,13]]},"ISBN":["9783032079589","9783032079596"],"references-count":22,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07959-6_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,13]]},"assertion":[{"value":"13 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Szeged","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hungary","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom.inf.u-szeged.hu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
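For reference, a record with this shape (status, message-type, message) can be retrieved directly from the Crossref public REST API at https://api.crossref.org/works/{doi}. Below is a minimal sketch in Python, assuming the third-party requests package is installed; the field names used are the ones that appear in the record above.

# Minimal sketch: fetch and inspect a Crossref work record by DOI.
# Assumption: the third-party `requests` package is available.
import requests

DOI = "10.1007/978-3-032-07959-6_4"  # the chapter record above
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()

# The work metadata sits under the top-level "message" key,
# mirroring the record shown above.
work = resp.json()["message"]
print(work["title"][0])
print(work["DOI"], work["page"])

# Entries in "reference" may carry a structured DOI, an unstructured
# citation string, or both, so access them defensively.
for ref in work.get("reference", []):
    print(ref["key"], ref.get("DOI", "(no DOI)"))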