{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:52:37Z","timestamp":1740099157083,"version":"3.37.3"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319995786"},{"type":"electronic","value":"9783319995793"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-319-99579-3_40","type":"book-chapter","created":{"date-parts":[[2018,8,24]],"date-time":"2018-08-24T03:36:09Z","timestamp":1535081769000},"page":"377-386","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["End-to-End Speech Recognition in Russian"],"prefix":"10.1007","author":[{"given":"Nikita","family":"Markovnikov","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Irina","family":"Kipyatkova","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Elena","family":"Lyakso","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2018,8,25]]},"reference":[{"key":"40_CR1","doi-asserted-by":"crossref","unstructured":"Allauzen, C., Riley, M., Schalkwyk, J., Skut, W., Mohri, M.: OpenFst: a general and efficient weighted finite-state transducer library. In: Implementation and Application of Automata, pp. 11\u201323 (2007)","DOI":"10.1007\/978-3-540-76336-9_3"},{"key":"40_CR2","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. CoRR abs\/1409.0473 (2014). http:\/\/arxiv.org\/abs\/1409.0473"},{"key":"40_CR3","doi-asserted-by":"crossref","unstructured":"Bahdanau, D., Chorowski, J., Serdyuk, D., Brakel, P., Bengio, Y.: End-to-end attention-based large vocabulary speech recognition. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4945\u20134949. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"40_CR4","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q., Vinyals, O.: Listen, attend and spell: a neural network for large vocabulary conversational speech recognition. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4960\u20134964. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472621"},{"issue":"4","key":"40_CR5","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1006\/csla.1999.0128","volume":"13","author":"SF Chen","year":"1999","unstructured":"Chen, S.F., Goodman, J.: An empirical study of smoothing techniques for language modeling. Comput. Speech Lang. 13(4), 359\u2013394 (1999). https:\/\/doi.org\/10.1006\/csla.1999.0128 . http:\/\/www.sciencedirect.com\/science\/article\/pii\/S0885230899901286","journal-title":"Comput. Speech Lang."},{"key":"40_CR6","unstructured":"Cho, K., van Merrienboer, B., G\u00fcl\u00e7ehre, \u00c7., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. CoRR abs\/1406.1078 (2014). http:\/\/arxiv.org\/abs\/1406.1078"},{"key":"40_CR7","unstructured":"Chorowski, J.K., Bahdanau, D., Serdyuk, D., Cho, K., Bengio, Y.: Attention-based models for speech recognition. In: Advances in Neural Information Processing Systems, pp. 577\u2013585 (2015)"},{"key":"40_CR8","first-page":"191","volume":"1","author":"T Ganchev","year":"2005","unstructured":"Ganchev, T., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of various MFCC implementations on the speaker verification task. Proc. SPECOM 1, 191\u2013194 (2005)","journal-title":"Proc. SPECOM"},{"key":"40_CR9","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning, pp. 369\u2013376. ACM (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"40_CR10","unstructured":"Graves, A., Jaitly, N.: Towards end-to-end speech recognition with recurrent neural networks. In: Proceedings of the 31st International Conference on Machine Learning (ICML 2014), pp. 1764\u20131772 (2014)"},{"key":"40_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"40_CR12","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"40_CR13","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. CoRR abs\/1502.03167 (2015). http:\/\/arxiv.org\/abs\/1502.03167"},{"key":"40_CR14","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. CoRR abs\/1412.6980 (2014). http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"40_CR15","doi-asserted-by":"publisher","first-page":"219","DOI":"10.1007\/978-3-319-01931-4_29","volume-title":"Speech and Computer","author":"I Kipyatkova","year":"2013","unstructured":"Kipyatkova, I., Karpov, A.: Lexicon size and language model order optimization for Russian LVCSR. In: \u017delezn\u00fd, M., Habernal, I., Ronzhin, A. (eds.) Speech and Computer, pp. 219\u2013226. Springer, Cham (2013)"},{"key":"40_CR16","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"246","DOI":"10.1007\/978-3-319-43958-7_29","volume-title":"Speech and Computer","author":"I Kipyatkova","year":"2016","unstructured":"Kipyatkova, I., Karpov, A.: DNN-based acoustic modeling for Russian speech recognition using Kaldi. In: Ronzhin, A., Potapova, R., N\u00e9meth, G. (eds.) SPECOM 2016. LNCS (LNAI), vol. 9811, pp. 246\u2013253. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-43958-7_29"},{"issue":"10","key":"40_CR17","first-page":"1995","volume":"3361","author":"Y LeCun","year":"1995","unstructured":"LeCun, Y., Bengio, Y.: Convolutional networks for images, speech, and time series. Handb. Brain Theory Neural Netw. 3361(10), 1995 (1995)","journal-title":"Handb. Brain Theory Neural Netw."},{"key":"40_CR18","doi-asserted-by":"publisher","unstructured":"Liang, M., Hu, X.: Recurrent convolutional neural network for object recognition. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3367\u20133375, June 2015. https:\/\/doi.org\/10.1109\/CVPR.2015.7298958","DOI":"10.1109\/CVPR.2015.7298958"},{"key":"40_CR19","doi-asserted-by":"crossref","unstructured":"Liao, H., McDermott, E., Senior, A.: Large scale deep neural network acoustic modeling with semi-supervised training data for YouTube video transcription. In: 2013 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), pp. 368\u2013373. IEEE (2013)","DOI":"10.1109\/ASRU.2013.6707758"},{"key":"40_CR20","series-title":"Communications in Computer and Information Science","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1007\/978-3-319-71746-3_5","volume-title":"Artificial Intelligence and Natural Language","author":"N Markovnikov","year":"2018","unstructured":"Markovnikov, N., Kipyatkova, I., Karpov, A., Filchenkov, A.: Deep neural networks in Russian speech recognition. In: Filchenkov, A., Pivovarova, L., \u017di\u017eka, J. (eds.) AINL 2017. CCIS, vol. 789, pp. 54\u201367. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-71746-3_5"},{"key":"40_CR21","doi-asserted-by":"crossref","unstructured":"Miao, Y., Gowayyed, M., Metze, F.: EESEN: End-to-end speech recognition using deep RNN models and WFST-based decoding. In: 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), pp. 167\u2013174. IEEE (2015)","DOI":"10.1109\/ASRU.2015.7404790"},{"issue":"1","key":"40_CR22","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1006\/csla.2001.0184","volume":"16","author":"M Mohri","year":"2002","unstructured":"Mohri, M., Pereira, F., Riley, M.: Weighted finite-state transducers in speech recognition. Comput. Speech Lang. 16(1), 69\u201388 (2002)","journal-title":"Comput. Speech Lang."},{"key":"40_CR23","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"343","DOI":"10.1007\/978-3-319-66429-3_33","volume-title":"Speech and Computer","author":"B Popovi\u0107","year":"2017","unstructured":"Popovi\u0107, B., Pakoci, E., Pekar, D.: End-to-End large vocabulary speech recognition for the Serbian language. In: Karpov, A., Potapova, R., Mporas, I. (eds.) SPECOM 2017. LNCS (LNAI), vol. 10458, pp. 343\u2013352. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-66429-3_33"},{"key":"40_CR24","unstructured":"Povey, D., et al.: The kaldi speech recognition toolkit. In: IEEE 2011 workshop on automatic speech recognition and understanding, No. EPFL-CONF-192584, IEEE Signal Processing Society (2011)"},{"key":"40_CR25","doi-asserted-by":"publisher","unstructured":"Ravindran, S., Demirogulu, C., Anderson, D.V.: Speech recognition using filter-bank features. In: The Thrity-Seventh Asilomar Conference on Signals, Systems Computers, vol. 2, pp. 1900\u20131903, November 2003. https:\/\/doi.org\/10.1109\/ACSSC.2003.1292312","DOI":"10.1109\/ACSSC.2003.1292312"},{"key":"40_CR26","doi-asserted-by":"crossref","unstructured":"Soltau, H., Liao, H., Sak, H.: Neural speech recognizer: Acoustic-to-word LSTM model for large vocabulary speech recognition (2016)","DOI":"10.21437\/Interspeech.2017-1566"},{"key":"40_CR27","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Advances in neural information processing systems, pp. 3104\u20133112 (2014)"},{"key":"40_CR28","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"338","DOI":"10.1007\/978-3-319-43958-7_40","volume-title":"Speech and Computer","author":"V Verkhodanova","year":"2016","unstructured":"Verkhodanova, V., Ronzhin, A., Kipyatkova, I., Ivanko, D., Karpov, A., \u017delezn\u00fd, M.: HAVRUS corpus: high-speed recordings of audio-visual Russian speech. In: Ronzhin, A., Potapova, R., N\u00e9meth, G. (eds.) SPECOM 2016. LNCS (LNAI), vol. 9811, pp. 338\u2013345. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-43958-7_40"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-99579-3_40","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,10,22]],"date-time":"2019-10-22T18:06:56Z","timestamp":1571767616000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-99579-3_40"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783319995786","9783319995793"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-99579-3_40","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2018]]}}}