{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T17:43:17Z","timestamp":1742924597625,"version":"3.40.3"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030602758"},{"type":"electronic","value":"9783030602765"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-60276-5_4","type":"book-chapter","created":{"date-parts":[[2020,10,4]],"date-time":"2020-10-04T07:02:44Z","timestamp":1601794964000},"page":"35-44","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Exploration of End-to-End ASR for OpenSTT \u2013 Russian Open Speech-to-Text Dataset"],"prefix":"10.1007","author":[{"given":"Andrei","family":"Andrusenko","sequence":"first","affiliation":[]},{"given":"Aleksandr","family":"Laptev","sequence":"additional","affiliation":[]},{"given":"Ivan","family":"Medennikov","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,9,29]]},"reference":[{"key":"4_CR1","unstructured":"Andrusenko, A., Laptev, A., Medennikov, I.: Towards a competitive end-to-end speech recognition for chime-6 dinner party transcription. arXiv preprint arXiv:2004.10799 (2020). https:\/\/arxiv.org\/abs\/2004.10799v2"},{"key":"4_CR2","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: 3rd International Conference on Learning Representations, ICLR, May 2015"},{"key":"4_CR3","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/978-3-319-99579-3_4","volume-title":"Speech and Computer","author":"V Bataev","year":"2018","unstructured":"Bataev, V., Korenevsky, M., Medennikov, I., Zatvornitskiy, A.: Exploring end-to-end techniques for low-resource speech recognition. In: Karpov, A., Jokisch, O., Potapova, R. (eds.) SPECOM 2018. LNCS (LNAI), vol. 11096, pp. 32\u201341. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-99579-3_4"},{"key":"4_CR4","unstructured":"Boyer, F., Rouas, J.L.: End-to-end speech recognition: a review for the French language. arXiv preprint arXiv:1910.08502 (2019). http:\/\/arxiv.org\/abs\/1910.08502"},{"key":"4_CR5","doi-asserted-by":"publisher","unstructured":"Chan, W., Jaitly, N., Le, Q.V., Vinyals, O.: Listen, attend and spell: a neural network for large vocabulary conversational speech recognition. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4960\u20134964. IEEE (2016). https:\/\/doi.org\/10.1109\/ICASSP.2016.7472621","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"4_CR6","unstructured":"Denisov, P.: Espnet recipe results for Russian open speech to text (2019). https:\/\/github.com\/espnet\/espnet\/blob\/master\/egs\/ru_open_stt\/asr1\/RESULTS.md"},{"key":"4_CR7","unstructured":"Graves, A.: Sequence transduction with recurrent neural networks. In: Proceedings of the 29th International Conference on Machine Learning (2012)"},{"key":"4_CR8","doi-asserted-by":"publisher","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd International Conference on Machine Learning - ICML, pp. 369\u2013376. ACM Press (2006). https:\/\/doi.org\/10.1145\/1143844.1143891","DOI":"10.1145\/1143844.1143891"},{"key":"4_CR9","doi-asserted-by":"publisher","unstructured":"Hinton, G., Deng, l., Yu, D., Dahl, G., et al.: Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups. In: Signal Processing Magazine, IEEE, pp. 82\u201397, November 2012. https:\/\/doi.org\/10.1109\/MSP.2012.2205597","DOI":"10.1109\/MSP.2012.2205597"},{"key":"4_CR10","unstructured":"Iakushkin, O., Fedoseev, G., Shaleva, A., Degtyarev, A., Sedova, O.: Russian-language speech recognition system based on deepspeech. In: Proceedings of the VIII International Conference on Distributed Computing and Grid-technologies in Science and Education (GRID 2018), September 2018. https:\/\/github.com\/GeorgeFedoseev\/DeepSpeech"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Karita, S., Wang, X., Watanabe, S., Yoshimura, T., et al.: A comparative study on transformer vs RNN in speech applications. In: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 449\u2013456. IEEE, December 2019","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"4_CR12","doi-asserted-by":"publisher","unstructured":"Kim, S., Hori, T., Watanabe, S.: Joint CTC-attention based end-to-end speech recognition using multi-task learning. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4835\u20134839. IEEE, March 2017. https:\/\/doi.org\/10.1109\/ICASSP.2017.7953075","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"4_CR13","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"246","DOI":"10.1007\/978-3-319-43958-7_29","volume-title":"Speech and Computer","author":"I Kipyatkova","year":"2016","unstructured":"Kipyatkova, I., Karpov, A.: DNN-based acoustic modeling for Russian speech recognition using kaldi. In: Ronzhin, A., Potapova, R., N\u00e9meth, G. (eds.) SPECOM 2016. LNCS (LNAI), vol. 9811, pp. 246\u2013253. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-43958-7_29"},{"key":"4_CR14","doi-asserted-by":"crossref","unstructured":"Kudo, T., Richardson, J.: SentencePiece: a simple and language independent subword tokenizer and detokenizer for Neural Text Processing. In: Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 66\u201371 (2018). https:\/\/github.com\/google\/sentencepiece","DOI":"10.18653\/v1\/D18-2012"},{"key":"4_CR15","doi-asserted-by":"crossref","unstructured":"Laptev, A., Korostik, R., Svischev, A., Andrusenko, A., et al.: You do not need more data: improving end-to-end speech recognition by text-to-speech data augmentation. arXiv preprint arXiv:2005.07157 (2020)","DOI":"10.1109\/CISP-BMEI51763.2020.9263564"},{"key":"4_CR16","doi-asserted-by":"publisher","unstructured":"Li, J., Lavrukhin, V., Ginsburg, B., Leary, R., et al.: Jasper: an end-to-end convolutional neural acoustic model. In: Interspeech 2019, pp. 71\u201375. ISCA, September 2019. https:\/\/doi.org\/10.21437\/interspeech.2019-1819","DOI":"10.21437\/interspeech.2019-1819"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"L\u00fcscher, C., Beck, E., Irie, K., Kitza, M., et al.: RWTH ASR systems for LibriSpeech: hybrid vs attention. In: Interspeech 2019, pp. 231\u2013235. ISCA, September 2019. https:\/\/doi.org\/10.21437\/Interspeech.2019-1780","DOI":"10.21437\/Interspeech.2019-1780"},{"key":"4_CR18","series-title":"Communications in Computer and Information Science","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1007\/978-3-319-71746-3_5","volume-title":"Artificial Intelligence and Natural Language","author":"N Markovnikov","year":"2018","unstructured":"Markovnikov, N., Kipyatkova, I., Karpov, A., Filchenkov, A.: Deep neural networks in Russian speech recognition. In: Filchenkov, A., Pivovarova, L., \u017di\u017eka, J. (eds.) AINL 2017. CCIS, vol. 789, pp. 54\u201367. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-71746-3_5"},{"key":"4_CR19","doi-asserted-by":"crossref","unstructured":"Medennikov, I., Korenevsky, M., Prisyach, T., Khokhlov, Y., et al.: The STC system for the CHiME-6 challenge. In: CHiME 2020 Workshop on Speech Processing in Everyday Environments (2020)","DOI":"10.21437\/CHiME.2020-9"},{"key":"4_CR20","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"116","DOI":"10.1007\/978-3-319-43958-7_13","volume-title":"Speech and Computer","author":"I Medennikov","year":"2016","unstructured":"Medennikov, I., Prudnikov, A.: Advances in STC Russian spontaneous speech recognition system. In: Ronzhin, A., Potapova, R., N\u00e9meth, G. (eds.) SPECOM 2016. LNCS (LNAI), vol. 9811, pp. 116\u2013123. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-43958-7_13"},{"key":"4_CR21","doi-asserted-by":"publisher","unstructured":"Novak, J., Minematsu, N., Hirose, K.: Phonetisaurus: exploring grapheme-to-phoneme conversion with joint n-gram models in the WFST framework. Natural Language Engineering, pp. 1\u201332, September 2015. https:\/\/doi.org\/10.1017\/S1351324915000315 . https:\/\/github.com\/AdolfVonKleist\/Phonetisaurus","DOI":"10.1017\/S1351324915000315"},{"key":"4_CR22","unstructured":"Park, D.S., Zhang, Y., Jia, Y., Han, W., et al.: Improved noisy student training for automatic speech recognition. arXiv preprint arXiv:2005.09629 (2020). https:\/\/arxiv.org\/abs\/2005.09629v1"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Povey, D., Cheng, G., Wang, Y., Li, K., et al.: Semi-orthogonal low-rank matrix factorization for deep neural networks. In: Proceedings of the Interspeech 2018, pp. 3743\u20133747. ISCA, September 2018. https:\/\/doi.org\/10.21437\/Interspeech.2018-1417","DOI":"10.21437\/Interspeech.2018-1417"},{"key":"4_CR24","unstructured":"Povey, D., et al.: The kaldi speech recognition toolkit. In: IEEE Workshop on Automatic Speech Recognition and Understanding, December 2011. https:\/\/github.com\/kaldi-asr\/kaldi"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Povey, D., Peddinti, V., Galvez, D., Ghahremani, P., et al.: Purely sequence-trained neural networks for ASR based on lattice-free mmi. In: Interspeech 2016, pp. 2751\u20132755. ISCA, September 2016. https:\/\/doi.org\/10.21437\/Interspeech.2016-595","DOI":"10.21437\/Interspeech.2016-595"},{"key":"4_CR26","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-23132-7_29","volume-title":"Speech and Computer","author":"A Prudnikov","year":"2015","unstructured":"Prudnikov, A., Medennikov, I., Mendelev, V., Korenevsky, M., Khokhlov, Y.: Improving acoustic models for russian spontaneous speech recognition. In: Ronzhin, A., Potapova, R., Fakotakis, N. (eds.) SPECOM 2015. LNCS (LNAI), vol. 9319, pp. 234\u2013242. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-23132-7_29"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Ravanelli, M., Parcollet, T., Bengio, Y.: The pytorch-kaldi speech recognition toolkit. In: 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, May 2019. https:\/\/doi.org\/10.1109\/icassp.2019.8683713","DOI":"10.1109\/ICASSP.2019.8683713"},{"key":"4_CR28","doi-asserted-by":"publisher","unstructured":"Seide, F., Agarwal, A.: CNTK: Microsoft\u2019s open-source deep-learning toolkit. In: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. Association for Computing Machinery (2016). https:\/\/doi.org\/10.1145\/2939672.2945397 . https:\/\/github.com\/Microsoft\/CNTK","DOI":"10.1145\/2939672.2945397"},{"key":"4_CR29","unstructured":"Slizhikova, A., Veysov, A., Nurtdinova, D., Voronin, D., Baburov, Y.: Russian open speech to text (STT\/ASR) dataset v1.0 (2019). https:\/\/github.com\/snakers4\/open_stt\/"},{"key":"4_CR30","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30, pp. 5998\u20136008 (2017)"},{"key":"4_CR31","unstructured":"Veysov, A.: Toward\u2019s an imagenet moment for speech-to-text. The Gradient (2020). https:\/\/thegradient.pub\/towards-an-imagenet-moment-for-speech-to-text\/"},{"key":"4_CR32","unstructured":"Veysov, A.: stt c 2020\u201305-21) [quality comparison of our stt system with other systems in the market (update 2020\u201305-21)] (2020). https:\/\/www.silero.ai\/russian-stt-benchmarks-update1\/"},{"key":"4_CR33","doi-asserted-by":"crossref","unstructured":"Watanabe, S., Hori, T., Karita, S., Hayashi, T., et al.: ESPnet: end-to-end speech processing toolkit. In: Interspeech 2018, pp. 2207\u20132211. ISCA, September 2018. https:\/\/doi.org\/10.21437\/Interspeech.2018-1456 . https:\/\/github.com\/espnet\/espnet","DOI":"10.21437\/Interspeech.2018-1456"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-60276-5_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,4,6]],"date-time":"2021-04-06T02:06:47Z","timestamp":1617674807000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-60276-5_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030602758","9783030602765"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-60276-5_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"29 September 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"St. Petersburg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Russia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 October 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 October 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/specom.nw.ru\/2020\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"160","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"65","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"41% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Due to the Corona pandemic SPECOM 2020 was held as a virtual event","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}