{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T18:53:28Z","timestamp":1743015208451,"version":"3.40.3"},"publisher-location":"Cham","reference-count":26,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031646072"},{"type":"electronic","value":"9783031646089"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-64608-9_8","type":"book-chapter","created":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T05:01:59Z","timestamp":1719810119000},"page":"120-132","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Leveraging Wav2Vec2.0 for Kazakh Speech Recognition: An Experimental Study"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4235-9049","authenticated-orcid":false,"given":"Zhanibek","family":"Kozhirbayev","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,7,2]]},"reference":[{"key":"8_CR1","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in neural information processing systems, vol. 30 (2017)"},{"key":"8_CR2","doi-asserted-by":"publisher","unstructured":"Karita, S., et al.: A comparative study on transformer vs rnn in speech applications. 
In: 2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 449\u2013456 (2019). https:\/\/doi.org\/10.1109\/ASRU46091.2019.9003750","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"8_CR3","doi-asserted-by":"publisher","unstructured":"Nakatani, T.: Improving transformer-based end-to-end speech recognition with connectionist temporal classification and language model integration. In: Proceedings of Interspeech (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-1938","DOI":"10.21437\/Interspeech.2019-1938"},{"key":"8_CR4","doi-asserted-by":"publisher","unstructured":"Dong, L., Xu, S., Xu, B.: Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5884\u20135888 (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8462506","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"8_CR5","unstructured":"Oord, A. V. D., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"8_CR6","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: A framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems, vol. 33, pp. 12449\u201312460 (2020)"},{"key":"8_CR7","doi-asserted-by":"publisher","unstructured":"K\u00fcrzinger, L., Winkelbauer, D., Li, L., Watzel, T., Rigoll, G.: CTC-segmentation of large corpora for german end-to-end speech recognition. In: Proceedings of Speech and Computer: 22nd International Conference, SPECOM 2020, pp. 267\u2013278 (2020). 
https:\/\/doi.org\/10.1007\/978-3-030-60276-5_27","DOI":"10.1007\/978-3-030-60276-5_27"},{"key":"8_CR8","unstructured":"Jiang, D., et al.: Improving transformer-based speech recognition using unsupervised pre-training, arXiv preprint arXiv:1910.09932 (2019)"},{"key":"8_CR9","doi-asserted-by":"publisher","unstructured":"Schneider, S., Baevski, A., Collobert, R., Auli, M.: wav2vec: unsupervised pre-training for speech recognition. In: Proceedings of Interspeech, pp. 3465\u20133469 (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-1873","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"8_CR10","unstructured":"Baevski, A., Schneider, S., Auli, M.: vq-wav2vec: Self-supervised learning of discrete speech representations. arXiv preprint arXiv:1910.05453 (2019)"},{"key":"8_CR11","doi-asserted-by":"publisher","unstructured":"Conneau, A., Baevski, A., Collobert, R., Mohamed, A., Auli, M.: Unsupervised cross-lingual representation learning for speech recognition. In: Proceedings of Interspeech, pp. 2426\u20132430 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-329","DOI":"10.21437\/Interspeech.2021-329"},{"key":"8_CR12","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 4171\u20134186 (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"8_CR13","unstructured":"Yessenbayev, Z., Karabalayeva, M., Shamayeva, F.: Large vocabulary continuous speech recognition for Kazakh. In: Proceedings of the I International Conference on Computer processing of Turkic Languages, Astana, pp. 217\u2013221 (2013)"},{"key":"8_CR14","doi-asserted-by":"publisher","unstructured":"Kozhirbayev, Z.: Kazakh speech recognition: Wav2vec 2.0 vs. Whisper. J. Adv. Inform. 
Technol. 14(6), 1382\u20131389 (2023). https:\/\/doi.org\/10.12720\/jait.14.6.1382-1389","DOI":"10.12720\/jait.14.6.1382-1389"},{"issue":"15","key":"8_CR15","doi-asserted-by":"publisher","first-page":"8900","DOI":"10.3390\/app13158900","volume":"13","author":"Z Kozhirbayev","year":"2023","unstructured":"Kozhirbayev, Z., Islamgozhayev, T.: Cascade speech translation for the kazakh language. Appl. Sci. 13(15), 8900 (2023). https:\/\/doi.org\/10.3390\/app13158900","journal-title":"Appl. Sci."},{"key":"8_CR16","doi-asserted-by":"publisher","unstructured":"Mamyrbayev, O., Oralbekova, D., Kydyrbekova, A., Turdalykyzy, T., Bekarystankyzy, A.: End-to-end model based on RNN-T for Kazakh speech recognition. In: 2021 3rd International Conference on Computer Communication and the Internet (ICCCI), pp. 163\u2013167 (2021). https:\/\/doi.org\/10.1109\/ICCCI51764.2021.9486811","DOI":"10.1109\/ICCCI51764.2021.9486811"},{"key":"8_CR17","doi-asserted-by":"publisher","unstructured":"Mamyrbayev, O., Oralbekova, D., Alimhan, K., Nuranbayeva, B.: Hybrid end-to-end model for Kazakh speech recognition. Inter. J. Speech Technol., 1\u201310 (2022). https:\/\/doi.org\/10.1007\/s10772-022-09983-8","DOI":"10.1007\/s10772-022-09983-8"},{"key":"8_CR18","doi-asserted-by":"publisher","unstructured":"Khomitsevich, O., Mendelev, V., Tomashenko, N., Rybin, S., Medennikov, I., Kudubayeva, S.: A bilingual Kazakh-Russian system for automatic speech recognition and synthesis. In: Proceedings of Speech and Computer: 17th International Conference, SPECOM 2015, Athens, Greece, 20\u201324 September, pp. 25\u201333 (2015). https:\/\/doi.org\/10.1007\/978-3-319-23132-7_3","DOI":"10.1007\/978-3-319-23132-7_3"},{"key":"8_CR19","doi-asserted-by":"publisher","unstructured":"Khassanov, Y., Mussakhojayeva, S., Mirzakhmetov, A., Adiyev, A., Nurpeiissov, M., Varol, H.: A crowdsourced open-source Kazakh speech corpus and initial speech recognition baseline. 
In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pp. 697\u2013706 (2021). https:\/\/doi.org\/10.18653\/v1\/2021.eacl-main.58","DOI":"10.18653\/v1\/2021.eacl-main.58"},{"key":"8_CR20","doi-asserted-by":"publisher","unstructured":"Mussakhojayeva, S., Khassanov, Y., Varol, H.: KSC2: an industrial-scale open-source Kazakh speech corpus. In: Proceedings of the INTERSPEECH, Incheon, Republic of Korea, pp. 18\u201322 (2022). https:\/\/doi.org\/10.21437\/Interspeech.2022-421","DOI":"10.21437\/Interspeech.2022-421"},{"issue":"2","key":"8_CR21","doi-asserted-by":"publisher","first-page":"870","DOI":"10.3390\/s23020870","volume":"23","author":"W Meng","year":"2023","unstructured":"Meng, W., Yolwas, N.: A study of speech recognition for kazakh based on unsupervised pre-training. Sensors 23(2), 870 (2023). https:\/\/doi.org\/10.3390\/s23020870","journal-title":"Sensors"},{"key":"8_CR22","unstructured":"Makhambetov, O., Makazhanov, A., Yessenbayev, Z., Matkarimov, B., Sabyrgaliyev, I., Sharafudinov, A.: Assembling the kazakh language corpus. In: Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing, pp. 1022\u20131031 (2013)"},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Watanabe, S., et al.: Espnet: End-to-end speech processing toolkit. arXiv preprint arXiv:1804.00015 (2018)","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Ott, M., et al.: fairseq: a fast, extensible toolkit for sequence modeling. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (Demonstrations), pp. 48\u201353 (2019)","DOI":"10.18653\/v1\/N19-4009"},{"key":"8_CR25","unstructured":"Myrzakhmetov, B., Kozhirbayev, Z.: Extended language modeling experiments for Kazakh. In: Proceedings of 2018 International Workshop on Computational Models in Language and Speech, p. 
42 (2018)"},{"key":"8_CR26","unstructured":"Heafield, K.: KenLM: Faster and smaller language model queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp. 187\u2013197 (2011)"}],"container-title":["Lecture Notes in Computer Science","Computational Science and Its Applications \u2013 ICCSA 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-64608-9_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T05:07:31Z","timestamp":1719810451000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-64608-9_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031646072","9783031646089"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-64608-9_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"2 July 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICCSA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Computational Science and Its Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 July 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 July 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iccsa2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}