{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T04:27:10Z","timestamp":1767155230385,"version":"3.37.3"},"reference-count":29,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2022,8,2]],"date-time":"2022-08-02T00:00:00Z","timestamp":1659398400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,8,2]],"date-time":"2022-08-02T00:00:00Z","timestamp":1659398400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,7]]},"DOI":"10.1007\/s10772-022-09983-8","type":"journal-article","created":{"date-parts":[[2022,8,2]],"date-time":"2022-08-02T17:12:20Z","timestamp":1659460340000},"page":"261-270","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":16,"title":["Hybrid end-to-end model for Kazakh speech recognition"],"prefix":"10.1007","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8318-3794","authenticated-orcid":false,"given":"Orken Zh.","family":"Mamyrbayev","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4975-6493","authenticated-orcid":false,"given":"Dina O.","family":"Oralbekova","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0766-2229","authenticated-orcid":false,"given":"Keylan","family":"Alimhan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3426-1914","authenticated-orcid":false,"given":"Bulbul M.","family":"Nuranbayeva","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,8,2]]},"reference":[{"key":"9983_CR1","doi-asserted-by":"publisher","DOI":"10.1049\/sil2.12057","author":"H Alsayadi","year":"2021","unstructured":"Alsayadi, H., Abdelhamid, A., Hegazy, I., & Fayed, Z. (2021). Arabic speech recognition using end-to-end deep learning. IET Signal Processing. https:\/\/doi.org\/10.1049\/sil2.12057","journal-title":"IET Signal Processing"},{"key":"9983_CR2","unstructured":"Amirgaliyev, N., Kuanyshbay, D., & Baimuratov, O. (2020). Development of automatic speech recognition for Kazakh language using transfer learning. Speech recognition for Kazakh language project."},{"key":"9983_CR3","doi-asserted-by":"publisher","first-page":"EL221","DOI":"10.1121\/1.3124659","volume":"125","author":"J Brown","year":"2009","unstructured":"Brown, J., & Smaragdis, P. (2009). Hidden Markov and Gaussian mixture models for automatic call classification. The Journal of the Acoustical Society of America, 125, EL221\u2013EL224. https:\/\/doi.org\/10.1121\/1.3124659","journal-title":"The Journal of the Acoustical Society of America"},{"key":"9983_CR4","doi-asserted-by":"publisher","unstructured":"Chan, W., Jaitly, N., Le, Q., & Vinyals, O. (2016b). Listen, attend and spell: A neural network for large vocabulary conversational speech recognition. In 2016b IEEE international conference on acoustics, speech and signal processing (ICASSP), Shanghai, China, 2016 (pp. 4960\u20134964). https:\/\/doi.org\/10.1109\/ICASSP.2016.7472621.","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"9983_CR5","doi-asserted-by":"publisher","unstructured":"Chan, W., Jaitly, N., Le, Q., & Vinyals, O. (2016a). Listen, attend and spell: A neural network for large vocabulary conversational speech recognition. In IEEE international conference on acoustics, speech and signal processing (ICASSP), Shanghai (pp. 4960\u20134964). https:\/\/doi.org\/10.1109\/ICASSP.2016a.7472621.","DOI":"10.1109\/ICASSP.2016a.7472621"},{"key":"9983_CR6","doi-asserted-by":"publisher","DOI":"10.1017\/ATSIP.2020.23","author":"J Chen","year":"2020","unstructured":"Chen, J., Nishimura, R., & Kitaoka, N. (2020). End-to-end recognition of streaming Japanese speech using CTC and local attention. APSIPA Transactions on Signal and Information Processing. https:\/\/doi.org\/10.1017\/ATSIP.2020.23","journal-title":"APSIPA Transactions on Signal and Information Processing"},{"key":"9983_CR7","doi-asserted-by":"publisher","first-page":"62","DOI":"10.3390\/info12020062","volume":"12","author":"E Emiru","year":"2021","unstructured":"Emiru, E., Li, Y., Fesseha, A., & Diallo, M. (2021). Improving Amharic Speech Recognition System using connectionist temporal classification with attention model and phoneme-based byte-pair-encodings. Information, 12, 62. https:\/\/doi.org\/10.3390\/info12020062","journal-title":"Information"},{"key":"9983_CR8","doi-asserted-by":"publisher","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., & Schmidhuber, J. (2006). Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural 'networks. In ICML 2006\u2014Proceedings of the 23rd international conference on machine learning, 2006 (pp. 369\u2013376). https:\/\/doi.org\/10.1145\/1143844.1143891.","DOI":"10.1145\/1143844.1143891"},{"issue":"6","key":"9983_CR9","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton, G., Deng, L., Yu, D., Dahl, G., Mohamed, A.-R., Jaitly, N., Senior, A., Vanhoucke, V., Nguyen, P., Sainath, T., & Kingsbury, B. (2012). Deep neural networks for acoustic modeling in speech recognition. IEEE Signal Processing Magazine, 29(6), 82\u201397.","journal-title":"IEEE Signal Processing Magazine"},{"key":"9983_CR10","doi-asserted-by":"crossref","unstructured":"Hori, T., Watanabe, S., Zhang, Y., & Chan, W. (2017). Advances in Joint CTC\u2013attention based end-to-end speech recognition with a deep CNN encoder and RNN-LM. In INTERSPEECH 2017.","DOI":"10.21437\/Interspeech.2017-1296"},{"key":"9983_CR11","doi-asserted-by":"crossref","unstructured":"Ignatenko, G. S., & Lamchanovsky, A. G. (2019). Classification of audio signals using neural networks. Young Scientist, 48(286), 23\u201325. Retrieved 07\/02\/2022, from https:\/\/moluch.ru\/archive\/286\/64455\/.","DOI":"10.1055\/a-0902-6882"},{"issue":"3","key":"9983_CR12","doi-asserted-by":"publisher","first-page":"251","DOI":"10.2307\/1268779","volume":"33","author":"B Juang","year":"1991","unstructured":"Juang, B., & Rabiner, L. (1991). Hidden Markov models for speech recognition. Technometrics, 33(3), 251\u2013272. https:\/\/doi.org\/10.2307\/1268779","journal-title":"Technometrics"},{"key":"9983_CR13","doi-asserted-by":"crossref","unstructured":"Keren, G., & Schuller, B. (2016). Convolutional RNN: An enhanced model for extracting features from sequential data. In Proceedings of the international joint conference on neural networks, 2016 (pp. 3412\u20133419).","DOI":"10.1109\/IJCNN.2016.7727636"},{"key":"9983_CR14","doi-asserted-by":"crossref","unstructured":"Kim, S., Hori, T., & Watanabe, S. (2016). Joint CTC\u2013attention based end-to-end speech recognition using multi-task learning. In ICASSP 2017.","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"9983_CR15","unstructured":"Kingma, D., & Ba, J. (2014). Adam: a method for stochastic optimization. In Proceedings of the 3rd international conference for learning representation, 2014."},{"key":"9983_CR16","first-page":"707","volume":"10","author":"VI Levenshtein","year":"1996","unstructured":"Levenshtein, V. I. (1996). Binary codes capable of correcting deletions, insertions, and reversals. Soviet Physics Doklady, 10, 707\u2013710.","journal-title":"Soviet Physics Doklady"},{"issue":"9 (115)","key":"9983_CR17","doi-asserted-by":"publisher","first-page":"84","DOI":"10.15587\/1729-4061.2022.252801","volume":"1","author":"O Mamyrbayev","year":"2022","unstructured":"Mamyrbayev, O., Alimhan, K., Oralbekova, D., Bekarystankyzy, A., & Zhumazhanov, B. (2022). Identifying the influence of transfer learning method in developing an end-to-end automatic speech recognition system with a low data level. Eastern-European Journal of Enterprise Technologies, 1(9 (115)), 84\u201392. https:\/\/doi.org\/10.15587\/1729-4061.2022.252801","journal-title":"Eastern-European Journal of Enterprise Technologies"},{"issue":"9 (112)","key":"9983_CR18","doi-asserted-by":"publisher","first-page":"32","DOI":"10.15587\/1729-4061.2021.239186","volume":"4","author":"O Mamyrbayev","year":"2021","unstructured":"Mamyrbayev, O., Kydyrbekova, A., Alimhan, K., Oralbekova, D., Zhumazhanov, B., & Nuranbayeva, B. (2021). Development of security systems using DNN and i & x-vector classifiers. Eastern-European Journal of Enterprise Technologies, 4(9 (112)), 32\u201345. https:\/\/doi.org\/10.15587\/1729-4061.2021.239186","journal-title":"Eastern-European Journal of Enterprise Technologies"},{"issue":"3320","key":"9983_CR19","doi-asserted-by":"publisher","first-page":"42","DOI":"10.32014\/2020.2518-1726.64","volume":"4","author":"O Mamyrbayev","year":"2020","unstructured":"Mamyrbayev, O., & Oralbekova, D. (2020). Modern trends in the development of speech recognition systems. News of the National Academy of Sciences of the Republic of Kazakhstan, 4(3320), 42\u201351. https:\/\/doi.org\/10.32014\/2020.2518-1726.64","journal-title":"News of the National Academy of Sciences of the Republic of Kazakhstan"},{"key":"9983_CR20","doi-asserted-by":"publisher","first-page":"01012","DOI":"10.1051\/itmconf\/20192401012","volume":"24","author":"O Mamyrbayev","year":"2019","unstructured":"Mamyrbayev, O., Turdalyuly, M., Mekebayev, N., Kuralai, M., Alimhan, K., BabaAli, B., Nabieva, G., Duisenbayeva, A., & Akhmetov, B. (2019). Continuous speech recognition of Kazakh language. ITM Web of Conferences, 24, 01012. https:\/\/doi.org\/10.1051\/itmconf\/20192401012","journal-title":"ITM Web of Conferences"},{"key":"9983_CR21","doi-asserted-by":"publisher","unstructured":"Mamyrbayev, O., Oralbekova, D., Kydyrbekova, A., Turdalykyzy, T., & Bekarystankyzy, A. (2021a). End-to-end model based on RNN-T for Kazakh speech recognition. In 2021a 3rd International conference on computer communication and the Internet (ICCCI), 2021 (pp. 163\u2013167). https:\/\/doi.org\/10.1109\/ICCCI51764.2021.9486811.","DOI":"10.1109\/ICCCI51764.2021.9486811"},{"key":"9983_CR22","doi-asserted-by":"publisher","unstructured":"Mamyrbayev, O., Alimhan, K., & Zhumazhanov, B., Turdalykyzy, T., & Gusmanova, F. (2020). End-to-end speech recognition in agglutinative languages.https:\/\/doi.org\/10.1007\/978-3-030-42058-1_33","DOI":"10.1007\/978-3-030-42058-1_33"},{"key":"9983_CR23","doi-asserted-by":"publisher","unstructured":"Miao, H., Cheng, G., Zhang, P., & Li, T., & Yan, Y. (2019). Online hybrid CTC\/attention architecture for end-to-end speech recognition. In INTERSPEECH 2019 (pp. 2623\u20132627). https:\/\/doi.org\/10.21437\/Interspeech.2019-2018.","DOI":"10.21437\/Interspeech.2019-2018"},{"key":"9983_CR24","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/1549\/5\/052034","volume":"1549","author":"M Nie","year":"2020","unstructured":"Nie, M., & Lei, Z. (2020). Hybrid CTC\/attention architecture with self-attention and convolution hybrid encoder for speech recognition. Journal of Physics: Conference Series, 1549, 052034. https:\/\/doi.org\/10.1088\/1742-6596\/1549\/5\/052034","journal-title":"Journal of Physics: Conference Series"},{"key":"9983_CR25","doi-asserted-by":"publisher","unstructured":"Park H., Seo S., Sogang, D., Rim J., Kim C., Son H., Park, J., & Kim J. (2019). Korean grapheme unit-based speech recognition using attention\u2013CTC ensemble network. In 2019 International symposium on multimedia and communication technology (ISMAC), 2019 (pp. 1\u20135). https:\/\/doi.org\/10.1109\/ISMAC.2019.8836146","DOI":"10.1109\/ISMAC.2019.8836146"},{"issue":"8","key":"9983_CR26","doi-asserted-by":"publisher","first-page":"1240","DOI":"10.1109\/JSTSP.2017.2763455","volume":"11","author":"S Watanabe","year":"2017","unstructured":"Watanabe, S., Hori, T., Kim, S., Hershey, J. R., & Hayashi, T. (2017). Hybrid CTC\/attention architecture for end-to-end speech recognition. IEEE Journal of Selected Topics in Signal Processing, 11(8), 1240\u20131253. https:\/\/doi.org\/10.1109\/JSTSP.2017.2763455","journal-title":"IEEE Journal of Selected Topics in Signal Processing"},{"key":"9983_CR27","doi-asserted-by":"publisher","first-page":"4639","DOI":"10.3390\/app9214639","volume":"9","author":"L Wu","year":"2019","unstructured":"Wu, L., Li, T., Wang, L., & Yan, Y. (2019). Improving hybrid CTC\/attention architecture with time-restricted self-attention CTC for end-to-end speech recognition. Applied Sciences, 9, 4639. https:\/\/doi.org\/10.3390\/app9214639","journal-title":"Applied Sciences"},{"key":"9983_CR28","doi-asserted-by":"publisher","unstructured":"Zeyer, A., Irie, K., Schl\u00fcter, R., & Ney, H. (2018). Improved training of end-to-end attention models for speech recognition. In INTERSPEECH 2018 (pp. 7\u201311). https:\/\/doi.org\/10.21437\/Interspeech.2018-1616.","DOI":"10.21437\/Interspeech.2018-1616"},{"key":"9983_CR29","doi-asserted-by":"publisher","unstructured":"Zweig, G., & Nguyen, P. (2009). A segmental CRF approach to large vocabulary continuous speech recognition. In IEEE workshop on automatic speech recognition and understanding (pp. 152\u2013157). https:\/\/doi.org\/10.1109\/ASRU.2009.5372916.","DOI":"10.1109\/ASRU.2009.5372916"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-022-09983-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-022-09983-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-022-09983-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,31]],"date-time":"2023-07-31T11:14:43Z","timestamp":1690802083000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-022-09983-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,2]]},"references-count":29,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2023,7]]}},"alternative-id":["9983"],"URL":"https:\/\/doi.org\/10.1007\/s10772-022-09983-8","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2022,8,2]]},"assertion":[{"value":"30 July 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 June 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 August 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}