{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,6]],"date-time":"2025-10-06T18:10:34Z","timestamp":1759774234719,"version":"3.37.3"},"reference-count":29,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2021,4,15]],"date-time":"2021-04-15T00:00:00Z","timestamp":1618444800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,4,15]],"date-time":"2021-04-15T00:00:00Z","timestamp":1618444800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100006769","name":"Russian Science Foundation","doi-asserted-by":"publisher","award":["18-18-00063"],"award-info":[{"award-number":["18-18-00063"]}],"id":[{"id":"10.13039\/501100006769","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2021,9]]},"DOI":"10.1007\/s10772-021-09840-0","type":"journal-article","created":{"date-parts":[[2021,4,15]],"date-time":"2021-04-15T15:20:11Z","timestamp":1618500011000},"page":"729-735","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["GAN acoustic model for Kazakh speech synthesis"],"prefix":"10.1007","volume":"24","author":[{"given":"Arman","family":"Kaliyev","sequence":"first","affiliation":[]},{"given":"Bassel","family":"Zeno","sequence":"additional","affiliation":[]},{"given":"Sergey V.","family":"Rybin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7010-1585","authenticated-orcid":false,"given":"Yuri N.","family":"Matveev","sequence":"additional","affiliation":[]},{"given":"Elena E.","family":"Lyakso","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,4,15]]},"reference":[{"key":"9840_CR1","unstructured":"Berment, V. (2004). Methods to computerize \u201clittle equipped\u201d languages and groups of languages. Theses: Universit\u00e9 Joseph-Fourier - Grenoble I."},{"key":"9840_CR2","unstructured":"Bollepalli, B., Juvela, L., & Alku, P. (2019). Generative adversarial network-based glottal waveform model for statistical parametric speech synthesis. arXiv e-prints, p.\u00a0arXiv:1903.05955."},{"key":"9840_CR3","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014). Generative adversarial nets. in Advances in Neural Information Processing Systems 27 (Z.\u00a0Ghahramani, M.\u00a0Welling, C.\u00a0Cortes, N.\u00a0D. Lawrence, and K.\u00a0Q. Weinberger, eds.), pp.\u00a02672\u20132680, Curran Associates, Inc."},{"key":"9840_CR4","doi-asserted-by":"crossref","unstructured":"Han, J., Zhang, Z., Ren, Z., Ringeval, F., & Schuller, B.\u00a0W. (2018). Towards conditional adversarial training for predicting emotions from speech. 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a06822\u20136826.","DOI":"10.1109\/ICASSP.2018.8462579"},{"key":"9840_CR5","doi-asserted-by":"publisher","first-page":"274","DOI":"10.1007\/978-3-319-99579-3_29","volume-title":"Speech and computer","author":"A Kaliyev","year":"2018","unstructured":"Kaliyev, A., Rybin, S. V., & Matveev, Y. N. (2018). Phoneme duration prediction for Kazakh language. In A. Karpov, O. Jokisch, & R. 
Potapova (Eds.), Speech and computer (pp. 274\u2013280). Cham: Springer International Publishing."},{"key":"9840_CR6","doi-asserted-by":"publisher","first-page":"741","DOI":"10.1007\/978-3-319-66429-3_74","volume-title":"Speech and computer","author":"A Kaliyev","year":"2017","unstructured":"Kaliyev, A., Rybin, S. V., & Matveev, Y. N. (2017). The pausing method based on brown clustering and word embedding. In A. Karpov, R. Potapova, & I. Mporas (Eds.), Speech and computer (pp. 741\u2013747). Cham: Springer International Publishing."},{"key":"9840_CR7","doi-asserted-by":"crossref","unstructured":"Kaliyev, A., Matveev, Y.\u00a0N., Lyakso, E.\u00a0E., & Rybin, S.\u00a0V. (2018). Prosodic processing for the automatic synthesis of emotional russian speech. in 2018 IEEE International Conference \u201cQuality Management, Transport and Information Security, Information Technologies\u201d (IT QM IS), Proceedings of the 2018 International Conference \u201dQuality Management, Transport and Information Security, Information Technologies\u201d, IT and QM and IS 2018, (United States), pp.\u00a0653\u2013655, Institute of Electrical and Electronics Engineers Inc.","DOI":"10.1109\/ITMQIS.2018.8525072"},{"key":"9840_CR8","doi-asserted-by":"crossref","unstructured":"Kaliyev, A., Rybin, S.\u00a0V., Matveev, Y.\u00a0N., Kaziyeva, N., & Burambayeva, N. (2018). \u201cModeling pause for the synthesis of kazakh speech,\u201d in Proceedings of the Fourth International Conference on Engineering & MIS 2018, ICEMIS \u201918, (New York, NY, USA), pp.\u00a01:1\u20131:4, ACM.","DOI":"10.1145\/3234698.3234699"},{"key":"9840_CR9","first-page":"117","volume":"20162015","author":"A Karpov","year":"2015","unstructured":"Karpov, A., & Verkhodanova, V. (2015). Speech technologies for under-resourced languages of the world. Voprosy Jazykoznanija, 20162015, 117\u2013135.","journal-title":"Voprosy Jazykoznanija"},{"key":"9840_CR10","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1007\/978-3-319-23132-7_3","volume-title":"Speech and computer","author":"O Khomitsevich","year":"2015","unstructured":"Khomitsevich, O., Mendelev, V., Tomashenko, N., Rybin, S., Medennikov, I., & Kudubayeva, S. (2015). A bilingual Kazakh\u2013Russian system for automatic speech recognition and synthesis. In A. Ronzhin, R. Potapova, & N. Fakotakis (Eds.), Speech and computer (pp. 25\u201333). Cham: Springer International Publishing."},{"key":"9840_CR11","first-page":"8","volume":"2003","author":"S Krauwer","year":"2003","unstructured":"Krauwer, S. (2003). The basic language resource kit (blark) as the first milestone for the language resources roadmap. Proceedings of SPECOM, 2003, 8\u201315.","journal-title":"Proceedings of SPECOM"},{"key":"9840_CR12","unstructured":"Kumar, K., Kumar, R., de\u00a0Boissiere, T., Gestin, L., Teoh, W.\u00a0Z., Sotelo, J., de\u00a0Br\u00e9bisson, A., Bengio, Y., & Courville, A.\u00a0C. (2019). MelGAN: Generative adversarial networks for conditional waveform synthesis. in Advances in Neural Information Processing Systems, vol.\u00a032, Curran Associates, Inc."},{"key":"9840_CR13","doi-asserted-by":"crossref","unstructured":"Liu, B., Nie, S., Zhang, Y., Ke, D., Liang, S., & Liu, W. (2018). Boosting noise robustness of acoustic model via deep adversarial training. CoRR, vol.\u00a0abs\/1805.01357.","DOI":"10.1109\/ICASSP.2018.8462093"},{"key":"9840_CR14","unstructured":"Ma, S., Mcduff, D., & Song, Y. (2019). A generative adversarial network for style modeling in a text-to-speech system. 
in International Conference on Learning Representations, vol.\u00a02."},{"key":"9840_CR15","first-page":"3194","volume":"9","author":"AN Mon","year":"2019","unstructured":"Mon, A. N., Pa, W. P., & Thu, Y. K. (2019). Ucsy-sc1: A myanmar speech corpus for automatic speech recognition. International Journal of Electrical and Computer Engineering, 9, 3194\u20133202.","journal-title":"International Journal of Electrical and Computer Engineering"},{"key":"9840_CR16","doi-asserted-by":"publisher","first-page":"57","DOI":"10.1016\/j.specom.2016.09.001","volume":"84","author":"M Morise","year":"2016","unstructured":"Morise, M. (2016). D4c, a band-aperiodicity estimator for high-quality speech synthesis. Speech Communication, 84, 57\u201365.","journal-title":"Speech Communication"},{"key":"9840_CR17","doi-asserted-by":"publisher","first-page":"1877","DOI":"10.1587\/transinf.2015EDP7457","volume":"E99","author":"M Morise","year":"2016","unstructured":"Morise, M., Yokomori, F., & Ozawa, K. (2016). World: A vocoder-based high-quality speech synthesis system for real-time applications. IEICE Transactions on Information and Systems, E99, 1877\u20131884.","journal-title":"IEICE Transactions on Information and Systems"},{"key":"9840_CR18","doi-asserted-by":"publisher","first-page":"1123","DOI":"10.1007\/s10772-019-09652-3","volume":"22","author":"V Passricha","year":"2019","unstructured":"Passricha, V., & Aggarwal, R. K. (2019). PSO-based optimized CNN for Hindi ASR. International Journal of Speech Technology, 22, 1123\u20131133.","journal-title":"International Journal of Speech Technology"},{"key":"9840_CR19","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1109\/TASLP.2017.2761547","volume":"26","author":"Y Saito","year":"2018","unstructured":"Saito, Y., Takamichi, S., & Saruwatari, H. (2018). Statistical parametric speech synthesis incorporating generative adversarial networks. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 26, 84\u201396.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"9840_CR20","unstructured":"Skerry-Ryan, R.\u00a0J., Battenberg, E., Xiao, Y., Wang, Y., Stanton, D., Shor, J., Weiss, R.\u00a0J., Clark, R., & Saurous, R.\u00a0A. (2018). Towards end-to-end prosody transfer for expressive speech synthesis with tacotron. CoRR, vol.\u00a0abs\/1803.09047."},{"key":"9840_CR21","unstructured":"Sotelo, J., Mehri, Soroush., Kumar, K., Santos, J.\u00a0F., Kastner, K., Courville, A., & Bengio, Y. (2017). Char2wav: End-to-end speech synthesis. in International Conference on Learning Representations (Workshop Track), pp.\u00a01\u20136."},{"key":"9840_CR22","doi-asserted-by":"publisher","first-page":"931","DOI":"10.1007\/s10772-018-9551-4","volume":"21","author":"L Sun","year":"2018","unstructured":"Sun, L., Chen, J., Xie, K., & Gu, T. (2018). Deep and shallow features fusion based on deep convolutional neural network for speech emotion recognition. International Journal of Speech Technology, 21, 931\u2013940.","journal-title":"International Journal of Speech Technology"},{"key":"9840_CR23","unstructured":"Taigman, Y., Wolf, L., Polyak, A., & Nachmani, E. (2017). Voice synthesis for in-the-wild speakers via a phonological loop. CoRR, vol.\u00a0abs\/1707.06588."},{"key":"9840_CR24","doi-asserted-by":"crossref","unstructured":"Yamamoto, R., Song, E., & Kim, J. (2020). Parallel wavegan: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram. 
in ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a06199\u20136203.","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"9840_CR25","doi-asserted-by":"crossref","unstructured":"Yang, S., Xie, L., Chen, X., Lou, X., Zhu, X., Huang, D., & Li, H. (2017). Statistical parametric speech synthesis using generative adversarial networks under a multi-task learning framework. in 2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp.\u00a0685\u2013691.","DOI":"10.1109\/ASRU.2017.8269003"},{"key":"9840_CR26","doi-asserted-by":"crossref","unstructured":"Yang, J., Lee, J., Kim, Y., Cho, H.-Y., & Kim, I. (2020). VocGAN: A high-fidelity real-time vocoder with a hierarchically-nested adversarial network. in Proc. Interspeech, pp.\u00a0200\u2013204.","DOI":"10.21437\/Interspeech.2020-1238"},{"key":"9840_CR27","doi-asserted-by":"crossref","unstructured":"Yang, G., Yang, S., Liu, K., Fang, P., Chen, W., & Xie, L. (2020). Multi-band MelGAN: Faster waveform generation for high-quality text-to-speech. CoRR, vol.\u00a0abs\/2005.05106.","DOI":"10.1109\/SLT48900.2021.9383551"},{"key":"9840_CR28","doi-asserted-by":"publisher","first-page":"60478","DOI":"10.1109\/ACCESS.2018.2872060","volume":"6","author":"Y Zhao","year":"2018","unstructured":"Zhao, Y., Takaki, S., Luong, H., Yamagishi, J., Saito, D., & Minematsu, N. (2018). Wasserstein gan and waveform loss-based acoustic model training for multi-speaker text-to-speech synthesis systems using a wavenet vocoder. IEEE Access, 6, 60478\u201360488.","journal-title":"IEEE Access"},{"key":"9840_CR29","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/s10772-018-09573-7","volume":"22","author":"T Zia","year":"2019","unstructured":"Zia, T., & Zahid, U. (2019). Long short-term memory recurrent neural network architectures for Urdu acoustic modeling. 
International Journal of Speech Technology, 22, 21\u201330.","journal-title":"International Journal of Speech Technology"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-021-09840-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-021-09840-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-021-09840-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,8,17]],"date-time":"2021-08-17T11:31:31Z","timestamp":1629199891000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-021-09840-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,4,15]]},"references-count":29,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2021,9]]}},"alternative-id":["9840"],"URL":"https:\/\/doi.org\/10.1007\/s10772-021-09840-0","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2021,4,15]]},"assertion":[{"value":"24 April 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 March 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 April 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}
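The record above follows the standard Crossref REST API "work" response shape (status/message-type envelope around a "message" object). As a minimal sketch, assuming network access and the public api.crossref.org endpoint, the same record should be retrievable by DOI and its bibliographic fields and reference list read out as below; the field names used are taken directly from the record in this document.

import json
import urllib.request

# Fetch the Crossref work record for the DOI shown in this document.
# api.crossref.org/works/{DOI} is the public Crossref REST API endpoint.
DOI = "10.1007/s10772-021-09840-0"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

msg = record["message"]
print(msg["title"][0])          # "GAN acoustic model for Kazakh speech synthesis"
print(", ".join(f'{a["given"]} {a["family"]}' for a in msg["author"]))
print(msg["container-title"][0], msg["volume"], msg["page"])

# The deposited references may carry a DOI, an unstructured citation
# string, or both, so fall back gracefully when printing them.
for ref in msg.get("reference", []):
    print(ref["key"], ref.get("DOI") or ref.get("unstructured", ""))

The "references-count" field in the record (29) should match the length of msg["reference"]; counts and field availability vary by publisher deposit, so treating optional fields with .get() is the safer pattern.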