{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,22]],"date-time":"2025-12-22T22:12:16Z","timestamp":1766441536448,"version":"3.40.3"},"publisher-location":"Cham","reference-count":121,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031779602"},{"type":"electronic","value":"9783031779619"}],"license":[{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-77961-9_2","type":"book-chapter","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T13:54:27Z","timestamp":1732197267000},"page":"23-42","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Retrospective and Perspectives of TTS &amp; STT Technology Development and Implementation for South Slavic Under-Resourced Languages"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3426-3277","authenticated-orcid":false,"given":"Milan","family":"Se\u010dujski","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5413-1028","authenticated-orcid":false,"given":"Branislav","family":"Popovi\u0107","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3026-8086","authenticated-orcid":false,"given":"Darko","family":"Pekar","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7283-3939","authenticated-orcid":false,"given":"Nik\u0161a","family":"Jakovljevi\u0107","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5207-6638","authenticated-orcid":false,"given":"Edvin","family":"Pakoci","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0511-6729","authenticated-orcid":false,"given":"Sini\u0161a","family":"Suzi\u0107","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3707-0286","authenticated-orcid":false,"given":"Tijana","family":"Nosek","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0748-4672","authenticated-orcid":false,"given":"Nikola","family":"Simi\u0107","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2517-3728","authenticated-orcid":false,"given":"Vuk","family":"Stanojev","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4558-9918","authenticated-orcid":false,"given":"Vlado","family":"Deli\u0107","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"key":"2_CR1","doi-asserted-by":"crossref","unstructured":"Deli\u0107, V., et al.: Speech technology progress based on new machine learning paradigm. Computational Intelligensce and Neuroscience, Wiley, Article 4368036, 19 pages (2019)","DOI":"10.1155\/2019\/4368036"},{"key":"2_CR2","doi-asserted-by":"publisher","first-page":"85","DOI":"10.1016\/j.specom.2013.07.008","volume":"56","author":"L Besacier","year":"2014","unstructured":"Besacier, L., Barnard, E., Karpov, A., Schultz, T.: Automatic speech recognition for under-resourced languages: a survey. Speech Commun. 56, 85\u2013100 (2014)","journal-title":"Speech Commun."},{"key":"2_CR3","doi-asserted-by":"crossref","unstructured":"Swietojanski, P., Ghoshal, A., Renals, S.: Unsupervised crosslingual knowledge transfer in DNN-based LVCSR. In: Workshop SLT, pp. 246\u2013251. IEEE, Miami, FL, USA (2012)","DOI":"10.1109\/SLT.2012.6424230"},{"key":"2_CR4","unstructured":"Tan, X., Qin, T., Soong, F., Liu, T.Y.: A Survey on Neural Speech Synthesis. arXiv preprint arXiv:2106.15561 (2021)"},{"key":"2_CR5","unstructured":"Dutoit, T.: High Quality Text-To-Speech Synthesis of the French Language. Ph.D. dissertation. Supervised by Prof. Henri Leich. Facult\u00e9 Polytechnique de Mons. (1993)"},{"key":"2_CR6","unstructured":"Teranishi R., Umeda N.: Use of pronouncing dictionary in speech synthesis experiments. In: Reports of the Sixth International Congress on Acoustics, vol. 2, pp. 155\u2013158 (1968)"},{"issue":"4","key":"2_CR7","first-page":"5","volume":"7","author":"WI Hallahan","year":"1995","unstructured":"Hallahan, W.I.: DECtalk Software: text-to-speech technology and implementation. Digit. Tech. J. 7(4), 5\u201319 (1995)","journal-title":"Digit. Tech. J."},{"key":"2_CR8","volume-title":"An Introduction to Text-to-Speech Synthesis","author":"T Dutoit","year":"1999","unstructured":"Dutoit, T.: An Introduction to Text-to-Speech Synthesis. Kluwer Academic Publishers, Dordrecht, Boston, London (1999)"},{"issue":"2","key":"2_CR9","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1006\/csla.1994.1005","volume":"8","author":"J Van Santen","year":"1994","unstructured":"Van Santen, J.: Assignment of segmental duration in text-to-speech synthesis. Comput. Speech Lang. 8(2), 95\u2013128 (1994)","journal-title":"Comput. Speech Lang."},{"key":"2_CR10","unstructured":"Sejnowski, T., Rosenberg, C.R.: Parallel networks that learn to pronounce English text. Complex Syst.1, 145\u2013168 (1987)"},{"key":"2_CR11","doi-asserted-by":"crossref","unstructured":"McCulloch, N., Bedworth, M., Bridle J.: NETspeak \u2013 a re-implementation of NETtalk. Comput. Speech Lang. 2, 289\u2013301 (1987)","DOI":"10.1016\/0885-2308(87)90013-1"},{"key":"2_CR12","unstructured":"Ronanki, S.: Prosody Generation for Text-to-Speech Synthesis. Ph.D. thesis, University of Edinburgh (2019)"},{"key":"2_CR13","doi-asserted-by":"crossref","unstructured":"Sagisaka, Y., Kaiki, N., Iwahashi, N., Mimura, K.: ATR v-TALK speech synthesis system. In: Proceedings of International Conference on Spoken Language Processing, pp. 483\u2013486 (1992)","DOI":"10.21437\/ICSLP.1992-125"},{"key":"2_CR14","doi-asserted-by":"crossref","unstructured":"Donovan, R.E., Eide, E.: The IBM trainable speech synthesis system. In: Proceedings of 5th International Conference on Spoken Language Processing (ICSLP 98), p. 4, ISCA, Sydney, Australia (1998)","DOI":"10.21437\/ICSLP.1998-10"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Hunt A.J., Black A.W.: Unit selection in a concatenative speech synthesis system using a large speech database. In: Proceedings of ICASSP, pp. 373\u2013376. IEEE, Atlanta, GA, USA (1996)","DOI":"10.1109\/ICASSP.1996.541110"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"Yoshimura, T., Tokuda, K., Masuko, T., Kobayashi, T., Kitamura T.: Simultaneous modeling of spectrum, pitch and duration in HMM-based speech synthesis, In: Proceedings of the 6th EUROSPEECH, pp. 2347\u20132350. Budapest, Hungary (1999)","DOI":"10.21437\/Eurospeech.1999-596"},{"key":"2_CR17","doi-asserted-by":"crossref","unstructured":"Yamagishi, J., Kobayashi, T., Nakano, Y., Ogata, K., Isogai J.: Analysis of speaker adaptation algorithms for HMM-based speech synthesis and a constrained SMAPLR adaptation algorithm. IEEE Trans. Audio Speech Lang. Process. 17(s1), 66\u201383 (2009)","DOI":"10.1109\/TASL.2008.2006647"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Yamagishi, J., Onishi, K., Masuko, T., Kobayashi, T.: Modeling of various speaking styles and emotions for HMM-based speech synthesis. In: Proceedings of the 10th EUROSPEECH, pp. 2461\u20132464. Geneva, Switzerland (2003)","DOI":"10.21437\/Eurospeech.2003-676"},{"issue":"6","key":"2_CR19","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1109\/TASL.2009.2015708","volume":"17","author":"Y Qian","year":"2009","unstructured":"Qian, Y., Liang, H., Soong, F.K.: A cross-language state sharing and mapping approach to bilingual (Mandarin-English) TTS. IEEE Trans. Audio Speech Lang. Process. 17(6), 1231\u20131239 (2009)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"5","key":"2_CR20","doi-asserted-by":"publisher","first-page":"1234","DOI":"10.1109\/JPROC.2013.2251852","volume":"101","author":"K Tokuda","year":"2013","unstructured":"Tokuda, K., Nankaku, Y., Toda, T., Zen, H., Yamagishi, J., Oura, K.: Speech synthesis based on hidden markov models. Proc. IEEE 101(5), 1234\u20131252 (2013)","journal-title":"Proc. IEEE"},{"key":"2_CR21","doi-asserted-by":"crossref","unstructured":"Yan, Z.-J., Qian, Y., Soong, F.K.: Rich-context unit selection (RUS) approach to high quality TTS. In: Proceedings of ICASSP, pp. 4798\u20134801. IEEE (2010)","DOI":"10.1109\/ICASSP.2010.5495150"},{"key":"2_CR22","doi-asserted-by":"crossref","unstructured":"Qian, Y., Soong, F.K., Yan, Z.J.: A unified trajectory tiling approach to high quality speech rendering. IEEE Trans. Audio Speech Lang. Process. 21(2), 280\u2013290 (2013)","DOI":"10.1109\/TASL.2012.2221460"},{"key":"2_CR23","doi-asserted-by":"crossref","unstructured":"Weijters, T., Thole, J.: Speech synthesis with artificial neural networks. In: Proceedings of the IEEE International Conference on Neural Networks, pp. 1764\u20131769, San Francisco, CA, USA (1993)","DOI":"10.1109\/ICNN.1993.298824"},{"key":"2_CR24","doi-asserted-by":"crossref","unstructured":"Zen, H., Senior, A., Schuster, M.: Statistical parametric speech synthesis using deep neural networks. In: Proceedings of the ICASSP, pp. 7962\u20137966. IEEE (2013)","DOI":"10.1109\/ICASSP.2013.6639215"},{"key":"2_CR25","doi-asserted-by":"crossref","unstructured":"Fan, Y., Qian, Y., Xie, F.L., Soong, F.K.: TTS synthesis with bidirectional LSTM based recurrent neural networks. In: Proceedings of 15th INTERSPEECH, pp. 1964\u20131968. ISCA, Singapore (2014)","DOI":"10.21437\/Interspeech.2014-443"},{"issue":"1","key":"2_CR26","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1109\/TASLP.2017.2761547","volume":"26","author":"Y Saito","year":"2018","unstructured":"Saito, Y., Takamichi, S., Saruwatari, H.: Statistical parametric speech synthesis incorporating generative adversarial networks. IEEE\/ACM Trans. Audio Speech Lang. Process. 26(1), 84\u201396 (2018)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"7","key":"2_CR27","doi-asserted-by":"publisher","first-page":"1255","DOI":"10.1109\/TASLP.2016.2551865","volume":"24","author":"Z Wu","year":"2016","unstructured":"Wu, Z., King, S.: Improving trajectory modelling for DNN-based speech synthesis by using stacked bottleneck features and minimum generation error training. IEEE\/ACM Trans. Audio Speech Lang. Process. 24(7), 1255\u20131265 (2016)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"2_CR28","doi-asserted-by":"crossref","unstructured":"Fan, Y., Qian, Y., Soong, F.K., He, L.: Multi-speaker modeling and speaker adaptation for DNN-based TTS synthesis. In: Proceedings of ICASSP, pp. 4475\u20134479. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178817"},{"key":"2_CR29","doi-asserted-by":"crossref","unstructured":"Wu, Z., Swietojanski, P., Veaux, C., Renals, S., King, S.: A study of speaker adaptation for DNN-based speech synthesis. In: Proceedings of the 16th INTERSPEECH, pp. 879\u2013883, Dresden (2015)","DOI":"10.21437\/Interspeech.2015-270"},{"key":"2_CR30","doi-asserted-by":"crossref","unstructured":"Hojo, N., Ijima, Y., Mizuno, H.: An investigation of DNN-based speech synthesis using speaker codes. In: Proceedings of the 17th INTERSPEECH 2016, pp. 2278\u20132282. San Francisco, USA (2016)","DOI":"10.21437\/Interspeech.2016-589"},{"key":"2_CR31","doi-asserted-by":"crossref","unstructured":"Fan, Y., Qian, Y., Soong, F.K., He, L.: Multi-speaker modeling and speaker adaptation for DNN-based TTS synthesis. In: Proceedings of ICASSP, pp. 4475\u20134479. Brisbane, Australia (2015)","DOI":"10.1109\/ICASSP.2015.7178817"},{"key":"2_CR32","doi-asserted-by":"crossref","unstructured":"Brave, S., Nass, C.: Emotion in human-computer interaction. In: Sears, A., Jacko, J.A. (eds.) Human-Computer Interaction Fundamentals, pp. 53\u201368, CRC, Boca Raton, USA (2009)","DOI":"10.1201\/b10368-6"},{"key":"2_CR33","doi-asserted-by":"crossref","unstructured":"Yamagishi, J., Onishi, K., Masuko, T., Kobayashi, T.: Modeling of various speaking styles and emotions for HMM-based speech synthesis. In: 8th EUROSPEECH, Geneva, Switzerland (2003)","DOI":"10.21437\/Eurospeech.2003-676"},{"key":"2_CR34","doi-asserted-by":"crossref","unstructured":"Eyben, F., et al.: Unsupervised clustering of emotion and voice styles for expressive TTS. In: Proceedings of ICASSP, pp. 4009\u20134012. IEEE (2012)","DOI":"10.1109\/ICASSP.2012.6288797"},{"issue":"5","key":"2_CR35","doi-asserted-by":"publisher","first-page":"134","DOI":"10.5923\/j.ajsp.20120205.06","volume":"2","author":"R Aihara","year":"2012","unstructured":"Aihara, R., Takashima, R., Takiguchi, T., Ariki, Y.: GMM-based emotional voice conversion using spectrum and prosody features. Am. J. Signal Process. 2(5), 134\u2013138 (2012)","journal-title":"Am. J. Signal Process."},{"key":"2_CR36","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1016\/j.specom.2018.03.002","volume":"99","author":"J Lorenzo-Trueba","year":"2018","unstructured":"Lorenzo-Trueba, J., Henter, G.E., Takaki, S., Yamagishi, J., Morino, Y., Ochiai, Y.: Investigating different representations for modeling and controlling multiple emotions in DNN-based speech synthesis. Speech Commun. 99, 135\u2013143 (2018)","journal-title":"Speech Commun."},{"key":"2_CR37","doi-asserted-by":"crossref","unstructured":"Luo, Z., Chen, J., Takiguchi, T., Ariki, Y.: Emotional voice conversion with adaptive scales F0 based on wavelet transform using limited amount of emotional data. In: Proceedings of the 18th INTERSPEECH, pp. 3399\u20133403. ISCA (2017)","DOI":"10.21437\/Interspeech.2017-984"},{"key":"2_CR38","doi-asserted-by":"crossref","unstructured":"Ming, H., Huang, D., Xie, L., Wu, J., Dong, M., Li, H.: Deep bidirectional LSTM modeling of timbre and prosody for emotional voice conversion. In: Proceedings of the 17th INTERSPEECH 2016, pp. 2453\u20132457. ISCA (2016)","DOI":"10.21437\/Interspeech.2016-1053"},{"key":"2_CR39","doi-asserted-by":"crossref","unstructured":"An, S., Ling, Z., Dai, L.: Emotional statistical parametric speech synthesis using LSTM-RNNS. In: Asia-Pacific Signal and Information Processing Association Annual Samit and Conference (APSIPA ASC), pp. 1613\u20131616, IEEE (2017)","DOI":"10.1109\/APSIPA.2017.8282282"},{"key":"2_CR40","unstructured":"Skerry-Ryan, R., et al.: Towards end-to-end prosody transfer for expressive speech synthesis with Tacotron. In: Proceedings of the 34th International Conference on Machine Learning, pp. 4693\u20134702. PMLR (2018)"},{"key":"2_CR41","doi-asserted-by":"crossref","unstructured":"Wu, P., Ling, Z., Liu, L., Jiang, Y., Wu, H., Dai, L.: End-to-end emotional speech synthesis using style tokens and semisupervised training. In: Asia-Pacific Signal and Information Processing Association Annual Samit and Conf. (APSIPA ASC), pp. 623\u2013627. IEEE (2019)","DOI":"10.1109\/APSIPAASC47483.2019.9023186"},{"issue":"4","key":"2_CR42","doi-asserted-by":"publisher","first-page":"3120","DOI":"10.1109\/TAFFC.2022.3233324","volume":"14","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Sisman, B., Rana, R., Schuller, B.W., Li, H.: Speech synthesis with mixed emotions. IEEE Trans. Affect. Comput. 14(4), 3120\u20133134 (2022)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"2_CR43","unstructured":"Van den Oord, A., Dieleman, S., Zen, H., et al.: WaveNet: a generative model for raw audio. arXiv preprint arXiv:1609.03499 12 (2016)"},{"key":"2_CR44","unstructured":"Van den Oord, A., et al.: Parallel WaveNet: fast high- fidelity speech synthesis. In: Proceedings of the 35th International Conference on Machine Learning, pp. 3915\u20133923. Stockholm, Sweden (2018)"},{"key":"2_CR45","unstructured":"Arik, S.O., et al.: Deep voice: real-time neural text-to-speech. In: Proceedings of the 34th International Conference on Machine Learning, pp. 195\u2013204. PMLR, Sydney, Australia (2017)"},{"key":"2_CR46","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Tacotron: towards end-to-end speech synthesis. In: Proceedings of the 18th INTERSPEECH 2017, pp. 4006\u20134010. ISCA, Stockholm, Sweden (2017)","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"2_CR47","doi-asserted-by":"crossref","unstructured":"Shen, J., et al.: Natural TTS synthesis by conditioning WaveNet on MEL spectrogram predictions. In: Proceedings of ICASSP, pp. 4779\u20134783. Calgary, Canada (2018)","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"2_CR48","unstructured":"Ping, W., Peng, K., Gibiansky, A., et al.: Deep voice 3: scaling text-to-speech with convolutional sequence learning. arXiv preprint arXiv:1710.07654 (2017)"},{"key":"2_CR49","unstructured":"Arik, S.\u00d6, Chen, J., Peng, K., Ping, W., Zhou, Y.: Neural voice cloning with a few samples. In: Advances in Neural Information Processing Systems 31, 32nd Conference on Neural Information Processing Systems, pp. 10040\u201310050, Montreal, Canada (2018)"},{"key":"2_CR50","unstructured":"Nachmani, E., Polyak, A., Taigman, Y., Wolf, L.: Fitting new speakers based on a short untranscribed sample. In: Proceedings of the 35th International Conference on Machine Learning, pp. 3680\u20133688. Stockholm, Sweden (2018)"},{"key":"2_CR51","doi-asserted-by":"crossref","unstructured":"Akuzawa, K., Iwasawa, Y., Matsuo, Y.: Expressive speech synthesis via modeling expressions with variational autoencoder. In: Proceedings of the 19th INTERSPEECH, pp. 3067\u20133071. ISCA, Hyderabad, India (2018)","DOI":"10.21437\/Interspeech.2018-1113"},{"key":"2_CR52","unstructured":"Ren, Y., et al.: Fastspeech: fast, robust and controllable text to speech. Adv. Neural Inf. Process. systems 32 (2019)"},{"key":"2_CR53","unstructured":"Ren, Y., et al.: Fastspeech 2: Fast and high-quality end-to-end text to speech. Preprint arXiv:2006.04558 (2020)"},{"key":"2_CR54","doi-asserted-by":"crossref","unstructured":"Nosek, T., Suzi\u0107, S., Se\u010dujski, M., Stanojev, V., Pekar, D., Deli\u0107, V.: End-to-end speech synthesis for the Serbian language based on Tacotron. In: Karpov, A. Deli\u0107, V., (eds.) SPECOM 2024, LNAI Part I - 15299, Springer, Heidelberg, Belgrade, Serbia (2024)","DOI":"10.1007\/978-3-031-77961-9_16"},{"key":"2_CR55","unstructured":"Wang, C., et al.: Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers. arXiv preprint arXiv:2301.02111 (2023)"},{"key":"2_CR56","unstructured":"Zhang, Z., et al.: Speak foreign languages with your own voice: Cross-lingual neural codec language modeling. arXiv preprint arXiv:2303.03926 (2023)"},{"key":"2_CR57","unstructured":"Han, B., et al.: VALL-E R: Robust and Efficient Zero-Shot Text-to-Speech Synthesis via Monotonic Alignment. arXiv preprint arXiv:2406.07855 (2024)"},{"key":"2_CR58","unstructured":"Meng, L., et al.: Autoregressive Speech Synthesis without Vector Quantization. arXiv preprint arXiv:2407.08551 (2024)"},{"key":"2_CR59","unstructured":"Casanova, E., Weber, J., Shulby, C., Candido Junior, A., G\u00f6lge, E., Antonelli Ponti, M.: YourTTS: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice Conversion for everyone. arXiv preprint arXiv:2112.02418 (2024)"},{"key":"2_CR60","unstructured":"Kong, J., Kim, J., Bae, J.: HiFi-GAN: generative adversarial networks for efficient and high fidelity speech synthesis. arXiv preprint arXiv:2010.05646 (2020)"},{"key":"2_CR61","doi-asserted-by":"crossref","unstructured":"Prenger, R., Valle, R., Catanzaro, B.: WaveGlow: A Flow-based Generative Network for Speech Synthesis. arXiv preprint arXiv:1811.00002 (2018)","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"2_CR62","doi-asserted-by":"crossref","unstructured":"Casanova, E., et al.: XTTS: a Massively Multilingual Zero-Shot Text-to-Speech Model. arXiv preprint arXiv:2406.04904 (2024)","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"2_CR63","doi-asserted-by":"crossref","unstructured":"Se\u010dujski, M., Obradovi\u0107, R., Pekar, D., Jovanov, Lj., Deli\u0107, V.: AlfaNum system for speech synthesis in Serbian language. In: Proceedings of the 5th International Conference Text, Speech and Dialogue (TSD 2002), pp. 237\u2013244. Brno, Czech Republic (2002)","DOI":"10.1007\/3-540-46154-X_32"},{"key":"2_CR64","unstructured":"Pakoci, E., Mak, R.: HMM-based speech synthesis for the Serbian language. In: Proceedings of the 56th ETRAN, vol. TE4, pp. 1\u20134. Zlatibor, Serbia (2012)"},{"issue":"1","key":"2_CR65","doi-asserted-by":"publisher","first-page":"32","DOI":"10.5937\/telfor1701032D","volume":"9","author":"T Deli\u0107","year":"2017","unstructured":"Deli\u0107, T., Se\u010dujski, M., Suzi\u0107, S.: A review of serbian parametric speech synthesis based on deep neural networks. TELFOR J. 9(1), 32\u201337 (2017)","journal-title":"TELFOR J."},{"issue":"4","key":"2_CR66","first-page":"434","volume":"26","author":"M Se\u010dujski","year":"2020","unstructured":"Se\u010dujski, M., Pekar, D., Suzi\u0107, S., Smirnov, A., Nosek, T.: Speaker\/style-dependent neural network speech synthesis based on speaker\/style embedding. J. Univ. Comput. Sci. 26(4), 434\u2013453 (2020)","journal-title":"J. Univ. Comput. Sci."},{"key":"2_CR67","doi-asserted-by":"crossref","unstructured":"Suzi\u0107, S., Se\u010dujski, M., Nosek, T., Deli\u0107, V., Pekar, D.: HiFi-GAN based text-to-speech synthesis in Serbian. In: Proceedings of 30th EUSIPCO, pp. 2231\u20132235, Belgrade, Serbia (2022)","DOI":"10.23919\/EUSIPCO55093.2022.9909548"},{"key":"2_CR68","doi-asserted-by":"publisher","first-page":"1664","DOI":"10.1121\/1.1936652","volume":"33","author":"T Sakai","year":"1961","unstructured":"Sakai, T., Doshita, S.: Phonetic Typewriter. J. Acoust. Soc. Am. 33, 1664 (1961)","journal-title":"J. Acoust. Soc. Am."},{"key":"2_CR69","doi-asserted-by":"publisher","first-page":"637","DOI":"10.1121\/1.1906946","volume":"24","author":"KH Davis","year":"1952","unstructured":"Davis, K.H., Biddulph, R., Balashek, S.: Automatic recognition of spoken digits. J. Acoust. Soc. Am. 24, 637\u2013642 (1952)","journal-title":"J. Acoust. Soc. Am."},{"key":"2_CR70","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1007\/BF01074755","volume":"4","author":"TK Vintsyuk","year":"1972","unstructured":"Vintsyuk, T.K.: Speech discrimination by dynamic programming. Cybern. Syst. Anal. 4, 52\u201357 (1972)","journal-title":"Cybern. Syst. Anal."},{"key":"2_CR71","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1109\/TASSP.1978.1163055","volume":"26","author":"H Sakoe","year":"1978","unstructured":"Sakoe, H., Chiba, S.: Dynamic programming algorithm optimization for spoken word recognition. IEEE Trans. Acoust. Speech Signal Process. 26, 43\u201349 (1978)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"2_CR72","doi-asserted-by":"publisher","first-page":"637","DOI":"10.1121\/1.1912679","volume":"50","author":"BS Atal","year":"1971","unstructured":"Atal, B.S., Hanauer, S.L.: Speech analysis and synthesis by linear prediction of the speech wave. J. Acoust. Soc. Am. 50, 637\u2013655 (1971)","journal-title":"J. Acoust. Soc. Am."},{"key":"2_CR73","doi-asserted-by":"publisher","first-page":"250","DOI":"10.1109\/TIT.1975.1055384","volume":"21","author":"F Jelinek","year":"1975","unstructured":"Jelinek, F., Bahl, L., Mercer, R.: Design of a linguistic statistical decoder for the recognition of continuous speech. IEEE Trans. Inf. Theory 21, 250\u2013256 (1975)","journal-title":"IEEE Trans. Inf. Theory"},{"key":"2_CR74","doi-asserted-by":"publisher","first-page":"1345","DOI":"10.1121\/1.381666","volume":"62","author":"DH Klatt","year":"1977","unstructured":"Klatt, D.H.: Review of the ARPA speech understanding project. J. Acoust. Soc. Am. 62, 1345\u20131366 (1977)","journal-title":"J. Acoust. Soc. Am."},{"key":"2_CR75","doi-asserted-by":"publisher","first-page":"532","DOI":"10.1109\/PROC.1976.10159","volume":"64","author":"F Jelinek","year":"1976","unstructured":"Jelinek, F.: Continuous speech recognition by statistical methods. Proc. IEEE 64, 532\u2013556 (1976)","journal-title":"Proc. IEEE"},{"key":"2_CR76","doi-asserted-by":"publisher","first-page":"1035","DOI":"10.1002\/j.1538-7305.1983.tb03114.x","volume":"62","author":"SE Levinson","year":"1983","unstructured":"Levinson, S.E., Rabiner, L.R., Sondhi, M.M.: An Introduction to the application of the theory of probabilistic functions of a markov process to automatic speech recognition. Bell Syst. Tech. J. 62, 1035\u20131074 (1983)","journal-title":"Bell Syst. Tech. J."},{"key":"2_CR77","doi-asserted-by":"publisher","first-page":"1235","DOI":"10.1002\/j.1538-7305.1985.tb00273.x","volume":"64","author":"B-H Juang","year":"1985","unstructured":"Juang, B.-H.: Maximum-likelihood estimation for mixture multivariate stochastic observations of markov chains. AT&T Tech. J. 64, 1235\u20131249 (1985)","journal-title":"AT&T Tech. J."},{"key":"2_CR78","doi-asserted-by":"publisher","first-page":"307","DOI":"10.1109\/TIT.1986.1057145","volume":"32","author":"B-H Juang","year":"1986","unstructured":"Juang, B.-H., Levinson, S., Sondhi, M.: Maximum likelihood estimation for multivariate mixture observations of Markov chains. IEEE Trans. on Inform. Theory 32, 307\u2013309 (1986)","journal-title":"IEEE Trans. on Inform. Theory"},{"key":"2_CR79","doi-asserted-by":"publisher","first-page":"599","DOI":"10.1109\/29.52701","volume":"38","author":"K-F Lee","year":"1990","unstructured":"Lee, K.-F.: Context-independent phonetic hidden Markov models for speaker-independent continuous speech recognition. IEEE Trans. Acoust. Speech Signal Process. 38, 599\u2013609 (1990)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"2_CR80","doi-asserted-by":"publisher","first-page":"369","DOI":"10.1006\/csla.1994.1019","volume":"8","author":"SJ Young","year":"1994","unstructured":"Young, S.J., Woodland, P.C.: State clustering in hidden Markov model-based continuous speech recognition. Comput. Speech Lang. 8, 369\u2013383 (1994)","journal-title":"Comput. Speech Lang."},{"key":"2_CR81","unstructured":"Mermelstein, P.: Distance measures for speech recognition, psychological and instrumental. Pattern Recogn. Artif. Intell. 374\u2013388 (1976)"},{"key":"2_CR82","doi-asserted-by":"publisher","first-page":"1738","DOI":"10.1121\/1.399423","volume":"87","author":"H Hermansky","year":"1990","unstructured":"Hermansky, H.: Perceptual linear predictive (PLP) analysis of speech. J. Acoust. Soc. Am. 87, 1738\u20131752 (1990)","journal-title":"J. Acoust. Soc. Am."},{"key":"2_CR83","doi-asserted-by":"publisher","first-page":"133","DOI":"10.1016\/S0167-6393(98)00033-8","volume":"25","author":"O Viikki","year":"1998","unstructured":"Viikki, O., Laurila, K.: Cepstral domain segmental feature vector normalization for noise robust speech recognition. Speech Commun. 25, 133\u2013147 (1998)","journal-title":"Speech Commun."},{"key":"2_CR84","doi-asserted-by":"crossref","unstructured":"Prasad, N.V., Umesh, S.: Improved cepstral mean and variance normalization using Bayesian framework. In: IEEE Workshop on Automatic Speech Recognition and Understanding, pp. 156\u2013161. IEEE, Olomouc, Czech Republic (2013)","DOI":"10.1109\/ASRU.2013.6707722"},{"key":"2_CR85","doi-asserted-by":"crossref","unstructured":"Rehr, R., Gerkmann, T.: Cepstral noise subtraction for robust automatic speech recognition. In: Proceedings of ICASSP, pp. 375\u2013378. IEEE, South Brisbane, Queensland, Australia (2015)","DOI":"10.1109\/ICASSP.2015.7177994"},{"key":"2_CR86","doi-asserted-by":"publisher","first-page":"578","DOI":"10.1109\/89.326616","volume":"2","author":"H Hermansky","year":"1994","unstructured":"Hermansky, H., Morgan, N.: RASTA processing of speech. IEEE Trans. on Speech Audio Processing 2, 578\u2013589 (1994)","journal-title":"IEEE Trans. on Speech Audio Processing"},{"key":"2_CR87","doi-asserted-by":"crossref","unstructured":"Bahl, L., Brown, P., De Souza, P., Mercer, R.: Maximum mutual information estimation of hidden Markov model parameters for speech recognition. In: Proceedings of ICASSP, pp. 49\u201352. IEEE, Tokyo, Japan (1986)","DOI":"10.1109\/ICASSP.1986.1169179"},{"key":"2_CR88","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/S0167-6393(97)00029-0","volume":"22","author":"V Valtchev","year":"1997","unstructured":"Valtchev, V., Odell, J.J., Woodland, P.C., Young, S.J.: MMIE training of large vocabulary recognition systems. Speech Commun. 22, 303\u2013314 (1997)","journal-title":"Speech Commun."},{"key":"2_CR89","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1109\/89.568732","volume":"5","author":"B-H Juang","year":"1997","unstructured":"Juang, B.-H., Hou, W., Lee, C.-H.: Minimum classification error rate methods for speech recognition. IEEE Trans. Speech Audio Process. 5, 257\u2013265 (1997)","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"2_CR90","doi-asserted-by":"crossref","unstructured":"Povey, D., Woodland, P.C.: Minimum phone error and i-smoothing for improved discriminative training. In: Proceedings of ICASSP, pp. I-105-I\u2013108. IEEE, Orlando, FL, USA (2002)","DOI":"10.1109\/ICASSP.2002.5743665"},{"key":"2_CR91","unstructured":"Ng, A.Y., Jordan, M.I.: On discriminative vs. generative classifiers: a comparison of logistic regression and naive Bayes. In: Proceedings of the 14th International Conference on Neural Information Processing Systems: Natural and Synthetic, pp. 841\u2013848. MIT Press, Cambridge, MA, USA (2001)"},{"key":"2_CR92","unstructured":"Macherey, W.: Discriminative training and acoustic modeling for automatic speech recognition. Ph.D. Thesis, Aachen Techn. Hochsch (2010)"},{"key":"2_CR93","doi-asserted-by":"publisher","first-page":"24","DOI":"10.1109\/TASSP.1975.1162650","volume":"23","author":"J Baker","year":"1975","unstructured":"Baker, J.: The DRAGON system\u2013An overview. IEEE Trans. Acoust. Speech Signal Process. 23, 24\u201329 (1975)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"2_CR94","doi-asserted-by":"crossref","unstructured":"Bahl, L.R., Jelinek, F., Mercer, R.L.: A maximum likelihood approach to continuous speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. PAMI-5, 179\u2013190 (1983)","DOI":"10.1109\/TPAMI.1983.4767370"},{"key":"2_CR95","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1006\/csla.1999.0128","volume":"13","author":"SF Chen","year":"1999","unstructured":"Chen, S.F., Goodman, J.: An empirical study of smoothing techniques for language modeling. Comput. Speech Lang. 13, 359\u2013393 (1999)","journal-title":"Comput. Speech Lang."},{"key":"2_CR96","doi-asserted-by":"publisher","first-page":"403","DOI":"10.1006\/csla.2001.0174","volume":"15","author":"JT Goodman","year":"2001","unstructured":"Goodman, J.T.: A bit of progress in language modeling. Comput. Speech Lang. 15, 403\u2013434 (2001)","journal-title":"Comput. Speech Lang."},{"key":"2_CR97","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1162\/neco.1989.1.1.1","volume":"1","author":"RP Lippmann","year":"1989","unstructured":"Lippmann, R.P.: Review of neural networks for speech recognition. Neural Comput. 1, 1\u201338 (1989)","journal-title":"Neural Comput."},{"key":"2_CR98","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4615-3210-1","volume-title":"Connectionist Speech Recognition: a Hybrid Approach","author":"HA Bourlard","year":"1994","unstructured":"Bourlard, H.A., Morgan, N.: Connectionist Speech Recognition: a Hybrid Approach. Springer, US, Boston, MA (1994)"},{"key":"2_CR99","unstructured":"Mohamed, A., Dahl, G.E., Hinton, G.E.: Deep belief networks for phone recognition. In: NIPS Workshop on Deep Learning for Speech Recognition and Related Applications, pp. 1\u20139. Vancouver, BC, Canada (2009)"},{"key":"2_CR100","doi-asserted-by":"crossref","unstructured":"Dahl, G.E., Dong Yu, Li Deng, Acero, A.: Context-dependent pre-trained deep neural networks for large-vocabulary speech recognition. IEEE Trans. Audio, Speech, Lang. Process. 20, 30\u201342 (2012)","DOI":"10.1109\/TASL.2011.2134090"},{"key":"2_CR101","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the International Conference on Machine Learning, pp. 369\u2013376. ACM Press, Pittsburgh, Pennsylvania (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"2_CR102","doi-asserted-by":"crossref","unstructured":"Maas, A., Xie, Z., Jurafsky, D., Ng, A.: Lexicon-free conversational speech recognition with neural networks. In: Proceedings Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 345\u2013354. Denver, Colorado (2015)","DOI":"10.3115\/v1\/N15-1038"},{"key":"2_CR103","doi-asserted-by":"crossref","unstructured":"Bahdanau, D., Chorowski, J., Serdyuk, D., Brakel, P., Bengio, Y.: End-to-end attention-based large vocabulary speech recognition. In: Proceedings of ICASSP, pp. 4945\u20134949. Shanghai (2016)","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"2_CR104","doi-asserted-by":"crossref","unstructured":"Karita, S., et al.: A comparative study on transformer vs RNN in speech applications. In: Automatic speech recognition and understanding workshop (ASRU), pp. 449\u2013456. IEEE, SG, Singapore (2019)","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"2_CR105","doi-asserted-by":"crossref","unstructured":"Zhu, H., Wang, L., Cheng, G., Wang, J., Zhang, P., Yan, Y.: Wav2vec-S: semi-supervised pre-training for low-resource ASR. In: Proceedings of the 23th INTERSPEECH, pp. 4870\u20134874. ISCA (2022)","DOI":"10.21437\/Interspeech.2022-909"},{"key":"2_CR106","doi-asserted-by":"crossref","unstructured":"Schneider, S., Baevski, A., Collobert, R., Auli, M.: wav2vec: Unsupervised pre-training for speech recognition. arXiv preprint arXiv:1904.05862 (2019)","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"2_CR107","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: Proceedings of the International Conference on Machine Learning, pp. 28492\u201328518 (2023)"},{"issue":"2","key":"2_CR108","doi-asserted-by":"publisher","first-page":"109","DOI":"10.5937\/telfor1402109S","volume":"6","author":"S Suzi\u0107","year":"2014","unstructured":"Suzi\u0107, S., Ostrogonac, S., Pakoci, E., Bojani\u0107, M.: Building a speech repository for a Serbian LVCSR system. Telfor J. 6(2), 109\u2013114 (2014)","journal-title":"Telfor J."},{"key":"2_CR109","doi-asserted-by":"crossref","unstructured":"Nosek, T., Suzi\u0107, S., Deli\u0107, V., Se\u010dujski, M.: Cross-lingual text-to-speech with prosody embedding. In: Proceedings of IWSSIP, 5 pages (2023)","DOI":"10.1109\/IWSSIP58668.2023.10180259"},{"key":"2_CR110","doi-asserted-by":"crossref","unstructured":"Pakoci, E.T., Popovi\u0107, B.Z.: Recurrent neural networks and morphological features in language modeling for Serbian. In: 29th Telecommunication Forum (TELFOR), 8 pages. IEEE (2021)","DOI":"10.1109\/TELFOR52709.2021.9653410"},{"key":"2_CR111","doi-asserted-by":"crossref","unstructured":"Deli\u0107, V., Se\u010dujski, M., Sedlar, N.V., Mi\u0161kovi\u0107, D., Mak, R., Bojani\u0107, M.: How speech technologies can help people with disabilities. In: Ronzhin, A., Potapova, R., Deli\u0107, V. (eds.) 16th SPECOM 2014, LNAI, vol. 8773, pp. 243\u2013250. Springer. Novi Sad, Serbia (2014)","DOI":"10.1007\/978-3-319-11581-8_30"},{"key":"2_CR112","doi-asserted-by":"crossref","unstructured":"Deli\u0107, V., et al.: Central audio-library of the university of Novi Sad. In: Proceedings of the Intelligent Distributed Computing XIII, pp. 467\u2013476. Springer International Publishing (2020)","DOI":"10.1007\/978-3-030-32258-8_55"},{"key":"2_CR113","doi-asserted-by":"crossref","unstructured":"Pakoci, E., Pekar, D., Popovi\u0107, B., Se\u010dujski, M., Deli\u0107, V.: Overcoming data sparsity in automatic transcription of dictated medical findings. In: Proceedings of the 30th EUSIPCO, pp. 454\u2013458. IEEE (2022)","DOI":"10.23919\/EUSIPCO55093.2022.9909893"},{"key":"2_CR114","doi-asserted-by":"crossref","unstructured":"Popovi\u0107, B., Pakoci, E., Jakovljevi\u0107, N., Ko\u010di\u0161, G., Pekar, D.: Voice assistant application for the Serbian language. In: 23rd Telecommunication Forum (TELFOR), pp. 858\u2013861. IEEE (2015)","DOI":"10.1109\/TELFOR.2015.7377600"},{"key":"2_CR115","doi-asserted-by":"crossref","unstructured":"Reitmaier, T., et al: Opportunities and challenges of automatic speech recognition systems for low-resource language speakers. In Proceedings of the CHI Conference on Human Factors in Computing Systems, p. 17 (2022)","DOI":"10.1145\/3491102.3517639"},{"key":"2_CR116","unstructured":"Mu, Z., Yang, X., Dong, Y.: Review of end-to-end speech synthesis technology based on deep learning. arXiv preprint arXiv:2104.09995 (2021)"},{"key":"2_CR117","unstructured":"Ogayo, P., Neubig, G., Black, A.W.: Building TTS systems for low resource languages under resource constraints. In: Proceedings Speech for Social Good Workshop, p. 5 (2022)"},{"key":"2_CR118","doi-asserted-by":"crossref","unstructured":"Jimerson, R., Liu, Z., Prud\u2019Hommeaux, E.: An (unhelpful) guide to selecting the best ASR architecture for your under-resourced language. In: Proceedings of the 61st Annual Meeting of the Association for Comp. Linguistics (Vol. 2 Short Papers), pp. 1008\u20131016 (2023)","DOI":"10.18653\/v1\/2023.acl-short.87"},{"key":"2_CR119","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: Wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)"},{"issue":"2","key":"2_CR120","doi-asserted-by":"publisher","first-page":"110","DOI":"10.5937\/telfor2002110P","volume":"12","author":"BZ Popovi\u0107","year":"2020","unstructured":"Popovi\u0107, B.Z., Pakoci, E.T., Pekar, D.J.: Transfer learning for domain and environment adaptation in Serbian ASR. Telfor Journal 12(2), 110\u2013115 (2020)","journal-title":"Telfor Journal"},{"key":"2_CR121","unstructured":"Deli\u0107, V.D., Pekar, D.J., Se\u010dujski, M.S., Popovi\u0107, B.Z., Pakoci, E.T., Suzi\u0107, S.B.: Development of speech technology for Serbian and its applications. In: Proceedings of the First Serbian International Conference on Applied Artificial Intelligence, p. 7. Kragujevac, Serbia (2022)"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-77961-9_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,9]],"date-time":"2025-01-09T16:03:47Z","timestamp":1736438627000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-77961-9_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,22]]},"ISBN":["9783031779602","9783031779619"],"references-count":121,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-77961-9_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,22]]},"assertion":[{"value":"22 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Belgrade","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Serbia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom2024.ftn.uns.ac.rs\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}