{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:22:24Z","timestamp":1760314944779,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":21,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032079589","type":"print"},{"value":"9783032079596","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07959-6_9","type":"book-chapter","created":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:00Z","timestamp":1760260920000},"page":"118-129","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Improving Whisper-Based Serbian ASR Using Synthetic Speech"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2517-3728","authenticated-orcid":false,"given":"Vuk","family":"Stanojev","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3707-0286","authenticated-orcid":false,"given":"Tijana","family":"Nosek","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0511-6729","authenticated-orcid":false,"given":"Sini\u0161a","family":"Suzi\u0107","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3026-8086","authenticated-orcid":false,"given":"Darko","family":"Pekar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4558-9918","authenticated-orcid":false,"given":"Vlado","family":"Deli\u0107","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3426-3277","authenticated-orcid":false,"given":"Milan","family":"Se\u010dujski","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,10,13]]},"reference":[{"key":"9_CR1","doi-asserted-by":"publisher","first-page":"325","DOI":"10.1109\/TASLP.2023.3328283","volume":"32","author":"R Prabhavalkar","year":"2023","unstructured":"Prabhavalkar, R., Hori, T., Sainath, T.N., Schl\u00fcter, R., Watanabe, S.: End-to-end speech recognition: a survey. IEEE\/ACM Trans. Audio Speech Lang. Process. 32, 325\u2013351 (2023)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Mimura, M., Ueno, S., Inaguma, H., Sakai, S., Kawahara, T.: Leveraging sequence-to-sequence speech synthesis for enhancing acoustic-to-word speech recognition. In: 2018 IEEE Spoken Language Technology Workshop (SLT), pp. 477\u2013484. IEEE (2018)","DOI":"10.1109\/SLT.2018.8639589"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Shen, J., et al.: Natural TTS synthesis by conditioning wavenet on MEL spectrogram predictions. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4779\u20134783. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Zheng, X., Liu, Y., Gunceler, D., Willett, D.: Using synthetic audio to improve the recognition of out-of-vocabulary words in end-to-end ASR systems. In: 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), ICASSP 2021, pp. 5674\u20135678. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414778"},{"key":"9_CR5","unstructured":"Aich, A.: Elastic weight consolidation (EWC): nuts and bolts. arXiv preprint arXiv:2105.04093 (2021)"},{"key":"9_CR6","doi-asserted-by":"crossref","unstructured":"Fazel, A., et al.: Synthasr: unlocking synthetic data for speech recognition. arXiv preprint arXiv:2106.07803 (2021)","DOI":"10.21437\/Interspeech.2021-1882"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Ueno, S., Mimura, M., Sakai, S., Kawahara, T.: Multi-speaker sequence-to-sequence speech synthesis for data augmentation in acoustic-to-word speech recognition. In: 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), ICASSP 2019, pp. 6161\u20136165. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682816"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Yuen, K.C., Li, H., Siong, C.E.: ASR model adaptation for rare words using synthetic data generated by multiple text-to-speech systems. In: 2023 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC), pp. 1771\u20131778. IEEE (2023)","DOI":"10.1109\/APSIPAASC58517.2023.10317116"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Ueno, S., Mimura, M., Sakai, S., Kawahara, T.: Data augmentation for ASR using TTS via a discrete representation. In: 2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 68\u201375. IEEE (2021)","DOI":"10.1109\/ASRU51503.2021.9688218"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"V\u00e1squez-Correa, J.C., Arzelus, H., Martin-Do\u00f1as, J.M., Arellano, J., Gonzalez-Docasal, A., \u00c1lvarez, A.: When whisper meets TTS: domain adaptation using only synthetic speech data. In: International Conference on Text, Speech, and Dialogue, pp. 226\u2013238. Springer, Cham (2023)","DOI":"10.1007\/978-3-031-40498-6_20"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Cornell, S., Darefsky, J., Duan, Z., Watanabe, S.: Generating data with text-to-speech and large-language models for conversational speech recognition. arXiv preprint arXiv:2408.09215 (2024)","DOI":"10.21437\/SynData4GenAI.2024-2"},{"key":"9_CR12","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"9_CR13","doi-asserted-by":"crossref","unstructured":"Ljube\u0161i\u0107, N., Klubi\u010dka, F.: {BS, HR, SR} WAC-web corpora of Bosnian, Croatian and Serbian. In: Proceedings of the 9th Web as Corpus Workshop (WaC-9), pp. 29\u201335 (2014)","DOI":"10.3115\/v1\/W14-0405"},{"key":"9_CR14","doi-asserted-by":"publisher","unstructured":"Vlado, D., Milan, S., Nik\u0161a, J., Marko, J., Radovan, O., Darko, P.: Speech technologies for Serbian and Kindred South Slavic languages. In: Shabtai, N.R. (ed.) Advances in Speech Recognition, SCIYO, pp. 141\u2013164 (2010). https:\/\/doi.org\/10.5772\/10115. ISBN 978-953-307-097-1","DOI":"10.5772\/10115"},{"key":"9_CR15","unstructured":"Babi\u0107, V.: The frequency of lowercase and uppercase letters, bigrams, and trigrams in the Serbian language. InfoM, no. 79\u201380\/2024, pp. 22\u201326 (2024)"},{"key":"9_CR16","unstructured":"Se\u010dujski, M.: Accentuation dictionary of Serbian intended for text-to-speech synthesis. In: Proceedings of the Digital Image and Signal Processing Conference on DOGS 2002, Be\u010dej, Serbia, pp. 17\u201320 (2002). (in Serbian)"},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Suzi\u0107, S., Pekar, D., Se\u010dujski, M., Nosek, T., Deli\u0107, V.: HiFi-GAN based text-to-speech synthesis in Serbian. In: 2022 30th European Signal Processing Conference (EUSIPCO), pp. 2231\u20132235. IEEE (2022)","DOI":"10.23919\/EUSIPCO55093.2022.9909548"},{"key":"9_CR18","first-page":"17022","volume":"33","author":"J Kong","year":"2020","unstructured":"Kong, J., Kim, J., Bae, J.: Hifi-GAN: generative adversarial networks for efficient and high fidelity speech synthesis. Adv. Neural. Inf. Process. Syst. 33, 17022\u201317033 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR19","unstructured":"Wagner, L., Thallinger, B., Zusag, M.: CrisperWhisper: accurate timestamps on verbatim speech transcriptions. arXiv preprint, arXiv:2408.16589 (2024)"},{"key":"9_CR20","doi-asserted-by":"crossref","unstructured":"Wu, T., et al.: A brief overview of ChatGPT: the history, status quo and potential future development. IEEE\/CAA J. Automatica Sinica 10(5), 1122\u20131136 (2023)","DOI":"10.1109\/JAS.2023.123618"},{"key":"9_CR21","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07959-6_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:10Z","timestamp":1760260930000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07959-6_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,13]]},"ISBN":["9783032079589","9783032079596"],"references-count":21,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07959-6_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,13]]},"assertion":[{"value":"13 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Szeged","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hungary","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom.inf.u-szeged.hu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}