{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,5]],"date-time":"2025-06-05T04:15:04Z","timestamp":1749096904273,"version":"3.41.0"},"publisher-location":"Cham","reference-count":100,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319439570"},{"type":"electronic","value":"9783319439587"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-43958-7_1","type":"book-chapter","created":{"date-parts":[[2016,8,12]],"date-time":"2016-08-12T02:11:41Z","timestamp":1470967901000},"page":"3-17","source":"Crossref","is-referenced-by-count":1,"title":["Automatic Speech Recognition Based on Neural Networks"],"prefix":"10.1007","author":[{"given":"Ralf","family":"Schl\u00fcter","sequence":"first","affiliation":[]},{"given":"Patrick","family":"Doetsch","sequence":"additional","affiliation":[]},{"given":"Pavel","family":"Golik","sequence":"additional","affiliation":[]},{"given":"Markus","family":"Kitza","sequence":"additional","affiliation":[]},{"given":"Tobias","family":"Menne","sequence":"additional","affiliation":[]},{"given":"Kazuki","family":"Irie","sequence":"additional","affiliation":[]},{"given":"Zolt\u00e1n","family":"T\u00fcske","sequence":"additional","affiliation":[]},{"given":"Albert","family":"Zeyer","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,8,13]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Abdel-Hamid, O., Mohamed, A., Jiang, H., Penn, G.: Applying convolutional neural networks concepts to hybrid NN-HMM model for speech recognition. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Kyoto, Japan, pp. 4277\u20134280, March 2012","DOI":"10.1109\/ICASSP.2012.6288864"},{"key":"1_CR2","unstructured":"Babel: US IARPA Project (2012\u20132016). http:\/\/www.iarpa.gov\/Programs\/ia\/Babel\/babel.html"},{"key":"1_CR3","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: International Conference on Learning Representations (ICLR), San Diego, CA, USA, May 2015"},{"key":"1_CR4","doi-asserted-by":"crossref","unstructured":"Bahdanau, D., Chorowski, J., Serdyuk, D., Brakel, P., Bengio, Y.: End-to-End attention-based large vocabulary speech recognition. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Shanghai, China, pp. 4945\u20134949, March 2016","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"1_CR5","unstructured":"Bahdanau, D., Serdyuk, D., Brakel, P., Ke, N.R., Chorowski, J., Courville, A.C., Bengio, Y.: Task loss estimation for sequence prediction. CoRR abs\/1511.06456 (2015). http:\/\/arxiv.org\/abs\/1511.06456"},{"key":"1_CR6","unstructured":"Bengio, Y., Ducharme, R., Vincent, P.: A neural probabilistic language model. In: Advances in Neural Information Processing Systems (NIPS), Denver, CO, USA, vol. 13, pp. 932\u2013938, November 2000"},{"key":"1_CR7","first-page":"502","volume-title":"Advances in neural information processing systems i","author":"H Bourlard","year":"1989","unstructured":"Bourlard, H., Wellekens, C.J.: Links between markov models and multilayer perceptrons. In: Touretzky, D. (ed.) Advances in neural information processing systems i, pp. 502\u2013510. Morgan Kaufmann, San Mateo, CA (1989)"},{"key":"1_CR8","volume-title":"Connectionist Speech Recognition: A Hybrid Approach","author":"HA Bourlard","year":"1993","unstructured":"Bourlard, H.A., Morgan, N.: Connectionist Speech Recognition: A Hybrid Approach. Kluwer Academic Publishers, Norwell (1993)"},{"key":"1_CR9","unstructured":"Breuel, T.M.: Benchmarking of LSTM Networks. arXiv preprint (2015). arXiv:1508.02774"},{"key":"1_CR10","series-title":"Nato ASI Series F: Computer and Systems Sciences","first-page":"227","volume-title":"Neurocomputing: Algorithms, Architectures and Applications","author":"JS Bridle","year":"1989","unstructured":"Bridle, J.S.: Probabilistic interpretation of feedforward classification network outputs with relationships to statistical pattern recognition. In: Souli\u00e9, F.F., H\u00e9rault, J. (eds.) Neurocomputing: Algorithms, Architectures and Applications. Nato ASI Series F: Computer and Systems Sciences, vol. 68, pp. 227\u2013236. Springer, Heidelberg (1989)"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Burget, L., Schwarz, P., Agarwal, M., Akayazi, P., Feng, K., Ghoshal, A., Glembek, O., Goel, N., Karafi\u00e1t, M., Povey, D., Rastrow, A., Rose, R.C., Thomas, S.: Multilingual acoustic modeling for speech recognition based on subspace gaussian mixture models. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 4334\u20134337 (2010)","DOI":"10.1109\/ICASSP.2010.5495646"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Byrne, W., Beyerlein, P., Huerta, J.M., Khudanpur, S., Marthi, B., Morgan, J., Peterek, N., Picone, J., Vergyri, D., Wang, W.: Towards language independent acoustic modeling. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), vol. 2, pp. 1029\u20131032 (2000)","DOI":"10.1109\/ICASSP.2000.859138"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Caruana, R.: Multitask learning: A knowledge-based source of inductive bias. In: International Conference on Machine Learning (ICML), pp. 41\u201348 (1993)","DOI":"10.1016\/B978-1-55860-307-3.50012-5"},{"key":"1_CR14","unstructured":"Chan, W., Jaitly, N., Le, Q.V., Vinyals, O.: Listen, Attend and Spell. CoRR abs\/1508.01211 (2015)"},{"key":"1_CR15","doi-asserted-by":"crossref","unstructured":"Chen, X., Liu, X., Gales, M., Woodland, P.: Investigation of back-off based interpolation between recurrent neural network and $$N$$ -gram language models. In: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), Scottsdale, AZ, USA, pp. 181\u2013186, December 2015","DOI":"10.1109\/ASRU.2015.7404792"},{"key":"1_CR16","unstructured":"Chung, J., G\u00fcl\u00e7ehre, \u00c7., Cho, K., Bengio, Y.: Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling. CoRR abs\/1412.3555 (2014)"},{"key":"1_CR17","unstructured":"Clevert, D., Unterthiner, T., Hochreiter, S.: Fast and accurate deep network learning by exponential linear units (ELUs). In: International Conference on Learning Representations (ICLR), San Juan, Puerto Rico, May 2016"},{"issue":"4","key":"1_CR18","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1109\/TASSP.1980.1163420","volume":"28","author":"S Davis","year":"1980","unstructured":"Davis, S., Mermelstein, P.: Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. IEEE Trans. Acoust. Speech Signal Process. 28(4), 357\u2013366 (1980)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"1_CR19","unstructured":"Dean, J., Corrado, G., Monga, R., Chen, K., Devin, M., Mao, M., Ranzato, M.A., Senior, A., Tucker, P., Yang, K., Le, Q.V., Ng, A.Y.: Large scale distributed deep networks. In: Pereira, F., Burges, C.J.C., Bottou, L., Weinberger, K.Q. (eds.) Advances in Neural Information Processing Systems (NIPS), pp. 1223\u20131231. Nips Foundation (2012). http:\/\/books.nips.cc"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Doetsch, P., Zeyer, A., Voigtlaender, P., Kulikov, I., Schl\u00fcter, R., Ney, H.: RETURNN: the RWTH extensible training framework for universal recurrent neural networks. In: Interspeech, San Francisco, CA, USA, September 2016, submitted","DOI":"10.1109\/ICASSP.2017.7953177"},{"key":"1_CR21","unstructured":"Duchi, J., Hazan, E., Singer, Y.: Adaptive Subgradient Methods for Online Learning and Stochastic Optimization. Technical Report UCB\/EECS-2010-24, EECS Department, University of California, Berkeley, March 2010"},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Geiger, J.T., Zhang, Z., Weninger, F., Schuller, B., Rigoll, G.: Robust speech recognition using long short-term memory recurrent neural networks for hybrid acoustic modelling. In: Interspeech, pp. 631\u2013635 (2014)","DOI":"10.21437\/Interspeech.2014-151"},{"key":"1_CR23","doi-asserted-by":"crossref","unstructured":"Golik, P., Doetsch, P., Ney, H.: Cross-entropy vs. squared error training: a theoretical and experimental comparison. In: Interspeech, Lyon, France, pp. 1756\u20131760, August 2013","DOI":"10.21437\/Interspeech.2013-436"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Golik, P., T\u00fcske, Z., Schl\u00fcter, R., Ney, H.: Convolutional neural networks for acoustic modeling of raw time signal in LVCSR. In: Interspeech, pp. 26\u201330. Dresden, Germany, September 2015","DOI":"10.21437\/Interspeech.2015-6"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Golik, P., T\u00fcske, Z., Schl\u00fcter, R., Ney, H.: Multilingual features based keyword search for very low-resource languages. In: Interspeech, Dresden, Germany, pp. 1260\u20131264, September 2015","DOI":"10.21437\/Interspeech.2015-316"},{"key":"1_CR26","unstructured":"Goodfellow, I.J., Warde-Farley, D., Mirza, M., Courville, A., Bengio, Y.: Maxout networks. In: International Conference on Machine Learning (ICML), Atlanta, GA, USA, June 2013"},{"key":"1_CR27","doi-asserted-by":"crossref","unstructured":"Graves, A., Mohamed, A.R., Hinton, G.: Speech recognition withdeep recurrent neural networks. In: IEEE International Conference on Acoustics, Speech, and SignalProcessing (ICASSP), pp. 6645\u20136649. IEEE (2013)","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"1_CR28","unstructured":"Graves, A.: Generating Sequences with Recurrent Neural Networks. CoRR abs\/1308.0850 (2013). http:\/\/arxiv.org\/abs\/1308.0850"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: International Conference on Machine Learning (ICML), NY, USA, pp. 369\u2013376 (2006). http:\/\/doi.acm.org\/10.1145\/1143844.1143891","DOI":"10.1145\/1143844.1143891"},{"issue":"5","key":"1_CR30","doi-asserted-by":"crossref","first-page":"602","DOI":"10.1016\/j.neunet.2005.06.042","volume":"18","author":"A Graves","year":"2005","unstructured":"Graves, A., Schmidhuber, J.: Framewise phoneme classification with bidirectional LSTM and other neural network architectures. Neural Netw. 18(5), 602\u2013610 (2005)","journal-title":"Neural Netw."},{"key":"1_CR31","unstructured":"Greff, K., Srivastava, R.K., Koutn\u00edk, J., Steunebrink, B.R., Schmidhuber, J.: LSTM: A Search Space Odyssey. arXiv preprint (2015). arXiv:1503.04069"},{"key":"1_CR32","doi-asserted-by":"crossref","unstructured":"Gr\u00e9zl, F., Karafi\u00e1t, M., Janda, M.: Study of probabilistic and bottle-neck features in multilingual environment. In: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 359\u2013364 (2011)","DOI":"10.1109\/ASRU.2011.6163958"},{"key":"1_CR33","doi-asserted-by":"crossref","unstructured":"Gr\u00e9zl, F., Karafi\u00e1t, M., Kont\u00e1r, S., \u010cernock\u00fd, J.: Probabilistic and bottle-neck features for LVCSR of meetings. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Honolulu, HI, USA, pp. 757\u2013760, April 2007","DOI":"10.1109\/ICASSP.2007.367023"},{"key":"1_CR34","unstructured":"G\u00fcl\u00e7ehre, \u00c7., Bengio, Y.: ADASECANT: Robust Adaptive Secant Method for Stochastic Gradient. CoRR abs\/1412.7419 (2014). http:\/\/arxiv.org\/abs\/1412.7419"},{"issue":"5","key":"1_CR35","doi-asserted-by":"crossref","first-page":"14","DOI":"10.1109\/MSP.2008.926652","volume":"25","author":"X He","year":"2008","unstructured":"He, X., Deng, L., Chou, W.: Discriminative learning in sequential pattern recognition - a unifying review for optimization-oriented speech recognition. IEEE Signal Process. Mag. 25(5), 14\u201336 (2008)","journal-title":"IEEE Signal Process. Mag."},{"issue":"6","key":"1_CR36","doi-asserted-by":"crossref","first-page":"58","DOI":"10.1109\/MSP.2012.2197232","volume":"29","author":"G Heigold","year":"2012","unstructured":"Heigold, G., Schl\u00fcter, R., Ney, H., Wiesler, S.: Discriminative training for automatic speech recognition: Modeling, criteria, optimization, implementation, and performance. IEEE Signal Process. Mag. 29(6), 58\u201369 (2012)","journal-title":"IEEE Signal Process. Mag."},{"issue":"4","key":"1_CR37","doi-asserted-by":"crossref","first-page":"578","DOI":"10.1109\/89.326616","volume":"2","author":"H Hermansky","year":"1994","unstructured":"Hermansky, H., Morgan, N.: RASTA processing of speech. IEEE Trans. Speech Audio Process. 2(4), 578\u2013589 (1994)","journal-title":"IEEE Trans. Speech Audio Process."},{"issue":"4","key":"1_CR38","doi-asserted-by":"crossref","first-page":"1738","DOI":"10.1121\/1.399423","volume":"87","author":"H Hermansky","year":"1990","unstructured":"Hermansky, H.: Perceptual linear predictive (PLP) analysis of speech. J. Acoust. Soc. Am. 87(4), 1738\u20131752 (1990)","journal-title":"J. Acoust. Soc. Am."},{"key":"1_CR39","doi-asserted-by":"crossref","unstructured":"Hermansky, H., Ellis, D., Sharma, S.: Tandem connectionist feature extraction for conventional HMM systems. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Istanbul, Turkey, vol. 3, pp. 1635\u20131638, June 2000","DOI":"10.1109\/ICASSP.2000.862024"},{"key":"1_CR40","doi-asserted-by":"crossref","unstructured":"Heymann, J., Drude, L., Chinaev, A., H\u00e4b-Umbach, R.: BLSTM supported GEV beamformer front-end for the 3rd CHiME challenge. In: Automatic Speech Recognition and Understanding Workshop (ASRU), December 2015","DOI":"10.1109\/ASRU.2015.7404829"},{"issue":"7","key":"1_CR41","doi-asserted-by":"crossref","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"GE Hinton","year":"2006","unstructured":"Hinton, G.E., Osindero, S., Teh, Y.W.: A fast learning algorithm for deep belief nets. Neural Comput. 18(7), 1527\u20131554 (2006)","journal-title":"Neural Comput."},{"key":"1_CR42","unstructured":"Hochreiter, S., Bengio, Y., Frasconi, P., Schmidhuber, J.: Gradient flow in recurrent nets: The difficulty of learning long-term dependencies. In: Kolen, J., Kremer, S. (eds.) A Field Guide to Dynamical Recurrent Networks. IEEE Press, New York (2001)"},{"issue":"8","key":"1_CR43","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"issue":"5","key":"1_CR44","doi-asserted-by":"crossref","first-page":"359","DOI":"10.1016\/0893-6080(89)90020-8","volume":"2","author":"K Hornik","year":"1989","unstructured":"Hornik, K., Stinchcombe, M.B., White, H.: Multilayer feedforward networks are universal approximators. Neural Netw. 2(5), 359\u2013366 (1989)","journal-title":"Neural Netw."},{"key":"1_CR45","unstructured":"Huang, G., Sun, Y., Liu, Z., Sedra, D., Weinberger, K.: Deep Networks with Stochastic Depth. arXiv preprint (2016). arXiv:1603.09382"},{"key":"1_CR46","doi-asserted-by":"crossref","unstructured":"Irie, K., T\u00fcske, Z., Alkhouli, T., Schl\u00fcter, R., Ney, H.: LSTM, GRU, highway and a bit of attention: an empirical overview for language modeling in speech recognition. In: Interspeech, San Francisco, CA, USA, September 2016, submitted","DOI":"10.21437\/Interspeech.2016-491"},{"key":"1_CR47","unstructured":"Jozefowicz, R., Zaremba, W., Sutskever, I.: An empirical exploration of recurrent network architectures. In: International Conference on Machine Learning (ICML), pp. 2342\u20132350 (2015)"},{"key":"1_CR48","unstructured":"Kingma, D.P., Ba, J.: Adam: A Method for Stochastic Optimization. CoRR abs\/1412.6980 (2014). http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"1_CR49","doi-asserted-by":"crossref","unstructured":"Kingsbury, B.: Lattice-based optimization of sequence classification criteria for neural-network acoustic modeling. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Taipei, Taiwan, pp. 3761\u20133764, April 2009","DOI":"10.1109\/ICASSP.2009.4960445"},{"key":"1_CR50","doi-asserted-by":"crossref","unstructured":"Kingsbury, B., Sainath, T.N., Soltau, H.: Scalable minimum bayes risk training of deep neural network acoustic models using distributed hessian-free optimization. In: Interspeech, Portland, OR, USA, September 2012","DOI":"10.21437\/Interspeech.2012-3"},{"key":"1_CR51","unstructured":"LeCun, Y., Boser, B., Denker, J.S., Henderson, D., Howard, R.E., Hubbard, W., Jackel, L.D.: Handwritten digit recognition with a back-propagation network. In: Advances in Neural Information Processing Systems (NIPS), Denver, CO, USA, vol. 2, November 1990"},{"key":"1_CR52","doi-asserted-by":"crossref","unstructured":"Li, B., Sim, K.C.: comparison of discriminative input and output transformations for speaker adaptation in the hybrid NN\/HMM systems. In: Interspeech, Makuhari, Japan, pp. 526\u2013529, September 2010","DOI":"10.21437\/Interspeech.2010-214"},{"issue":"1","key":"1_CR53","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1162\/neco.1989.1.1.1","volume":"1","author":"RP Lippmann","year":"1989","unstructured":"Lippmann, R.P.: Review of neural networks for speech recognition. Neural Comput. 1(1), 1\u201338 (1989)","journal-title":"Neural Comput."},{"key":"1_CR54","doi-asserted-by":"crossref","unstructured":"Miao, Y., Metze, F.: Distance-aware DNNs for robust speech recognition. In: Interspeech, Dresden, Germany, pp. 761\u2013765, September 2015","DOI":"10.21437\/Interspeech.2015-252"},{"key":"1_CR55","doi-asserted-by":"crossref","unstructured":"Mikolov, T., Karafi\u00e1t, M., Burget, L., Cernock\u1ef3, J., Khudanpur, S.: Recurrent neural network based language model. In: Interspeech, Makuhari, Japan, pp. 1045\u20131048, September 2010","DOI":"10.1109\/ICASSP.2011.5947611"},{"key":"1_CR56","unstructured":"Montufar, G.F., Pascanu, R., Cho, K., Bengio, Y.: On the number of linear regions of deep neural networks. In: Advances in Neural Information Processing Systems (NIPS), pp. 2924\u20132932 (2014)"},{"key":"1_CR57","unstructured":"Nair, V., Hinton, G.E.: Rectified linear units improve restricted boltzmann machines. In: International Conference on Machine Learning (ICML), Haifa, Israel, pp. 807\u2013814, June 2010"},{"key":"1_CR58","doi-asserted-by":"crossref","unstructured":"Nakamura, M., Shikano, K.: A study of english word category prediction based on neural networks. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Glasglow, UK, pp. 731\u2013734, May 1989","DOI":"10.1109\/ICASSP.1989.266531"},{"key":"1_CR59","unstructured":"Pascanu, R., Mikolov, T., Bengio, Y.: On the difficulty of training recurrent neural networks. arXiv preprint (2012). arxiv:1211.5063"},{"key":"1_CR60","doi-asserted-by":"crossref","unstructured":"Plahl, C., Kozielski, M., Schl\u00fcter, R., Ney, H.: Feature combination and stacking of recurrent and non-recurrent neural networks for LVCSR. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Vancouver, Canada, pp. 6714\u20136718, May 2013","DOI":"10.1109\/ICASSP.2013.6638961"},{"key":"1_CR61","doi-asserted-by":"crossref","unstructured":"Plahl, C., Schl\u00fcter, R., Ney, H.: Cross-lingual portability of Chinese and English neural network features for French and German LVCSR. In: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 371\u2013376 (2011)","DOI":"10.1109\/ASRU.2011.6163960"},{"key":"1_CR62","doi-asserted-by":"crossref","unstructured":"Qian, Y., Tan, T., Yu, D., Zhang, Y.: Integrated adaptation with multi-factor joint-learning for far-field speech recognition. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Shanghai, China, pp. 1\u20135 (2016)","DOI":"10.1109\/ICASSP.2016.7472783"},{"key":"1_CR63","doi-asserted-by":"crossref","unstructured":"Robinson, T., Hochberg, M., Renals, S.: IPA: Improved phone modelling with recurrent neural networks. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), vol. I, pp. 37\u201340, April 1994","DOI":"10.1109\/ICASSP.1994.389361"},{"key":"1_CR64","doi-asserted-by":"crossref","first-page":"533","DOI":"10.1038\/323533a0","volume":"323","author":"DE Rumelhart","year":"1986","unstructured":"Rumelhart, D.E., Hinton, G.E., Williams, R.J.: Learning representations by back-propagating errors. Nature 323, 533\u2013536 (1986)","journal-title":"Nature"},{"key":"1_CR65","doi-asserted-by":"crossref","unstructured":"Sainath, T.N., Weiss, R.J., Senior, A., Wilson, K.W., Vinyals, O.: Learning the speech front-end with raw waveform CLDNNs. In: Interspeech, pp. 1\u20135 (2015)","DOI":"10.21437\/Interspeech.2015-1"},{"key":"1_CR66","doi-asserted-by":"crossref","unstructured":"Sak, H., Senior, A., Beaufays, F.: Long short-term memory recurrent neural network architectures for large scale acoustic modeling. In: Interspeech, Singapore, pp. 338\u2013342, September 2014","DOI":"10.21437\/Interspeech.2014-80"},{"key":"1_CR67","doi-asserted-by":"crossref","unstructured":"Saon, G., Soltau, H., Nahamoo, D., Picheny, M.: Speaker adaptation of neural network acoustic models using i-Vectors. In: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), Olomouc, Czech Republic, pp. 55\u201359, December 2013","DOI":"10.1109\/ASRU.2013.6707705"},{"key":"1_CR68","doi-asserted-by":"crossref","unstructured":"Scanzio, S., Laface, P., Fissore, L., Gemello, R., Mana, F.: On the use of a multilingual neural network front-end. In: Interspeech, pp. 2711\u20132714 (2008)","DOI":"10.21437\/Interspeech.2008-672"},{"key":"1_CR69","doi-asserted-by":"crossref","unstructured":"Schaaf, T., Metze, F.: Analysis of gender normalization using MLP and VTLN features. In: Interspeech, pp. 306\u2013309 (2010)","DOI":"10.21437\/Interspeech.2010-117"},{"key":"1_CR70","doi-asserted-by":"crossref","unstructured":"Schl\u00fcter, R., Bezrukov, I., Wagner, H., Ney, H.: Gammatone features and feature combination for large vocabulary speech recognition. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 649\u2013652 (2007)","DOI":"10.1109\/ICASSP.2007.366996"},{"key":"1_CR71","doi-asserted-by":"crossref","unstructured":"Schultz, T., Waibel, A.: Fast bootstrapping Of LVCSR systems with multilingual phoneme sets. In: European Conference on Speech Communication and Technology (Eurospeech) (1997)","DOI":"10.21437\/Eurospeech.1997-141"},{"key":"1_CR72","doi-asserted-by":"crossref","unstructured":"Seide, F., Li, G., Chen, X., Yu, D.: Feature engineering in context-dependent deep neural networks for conversational speech transcription. In: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), Waikoloa, HI, USA, pp. 24\u201329, December 2011","DOI":"10.1109\/ASRU.2011.6163899"},{"key":"1_CR73","doi-asserted-by":"crossref","unstructured":"Seide, F., Li, G., Yu, D.: Conversational speech transcription using context-dependent deep neural networks. In: Interspeech, Florence, Italy, pp. 437\u2013440, August 2011","DOI":"10.21437\/Interspeech.2011-169"},{"key":"1_CR74","unstructured":"Sonoda, S., Murata, N.: Neural network with unbounded activation functions is universal approximator. Appl. Comput. Harmonic Anal. (2016, in Press), Corrected Proof, Available online 17 December 2015"},{"issue":"1","key":"1_CR75","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15(1), 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"1_CR76","unstructured":"Srivastava, R.K., Greff, K., Schmidhuber, J.: Training very deep networks. In: Advances in Neural Information Processing Systems (NIPS), pp. 2368\u20132376 (2015)"},{"key":"1_CR77","doi-asserted-by":"crossref","unstructured":"Stolcke, A., Gr\u00e9zl, F., Hwang, M.Y., Lei, X., Morgan, N., Vergyri, D.: Cross-domain and cross-language portability of acoustic features estimated by multilayer perceptrons. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 321\u2013324 (2006)","DOI":"10.1109\/ICASSP.2006.1660022"},{"issue":"3","key":"1_CR78","doi-asserted-by":"crossref","first-page":"517","DOI":"10.1109\/TASLP.2015.2400218","volume":"23","author":"M Sundermeyer","year":"2015","unstructured":"Sundermeyer, M., Ney, H., Schl\u00fcter, R.: From feedforward to recurrent LSTM neural networks for language modeling. IEEE\/ACM Trans. Audio Speech Lang. Process. 23(3), 517\u2013529 (2015)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"1_CR79","doi-asserted-by":"crossref","unstructured":"Sundermeyer, M., Schl\u00fcter, R., Ney, H.: LSTM neural networks for language modeling. In: Interspeech, Portland, OR, USA, pp. 194\u2013197, September 2012","DOI":"10.21437\/Interspeech.2012-65"},{"key":"1_CR80","doi-asserted-by":"crossref","unstructured":"Sundermeyer, M., T\u00fcske, Z., Schl\u00fcter, R., Ney, H.: Lattice decoding and rescoring with long-span neural network language models. In: Interspeech, Singapore, pp. 661\u2013665, September 2014","DOI":"10.21437\/Interspeech.2014-167"},{"key":"1_CR81","doi-asserted-by":"crossref","unstructured":"Thomas, S., Ganapathy, S., Hermansky, H.: Cross-lingual and multistream posterior features for low resource LVCSR systems. In: Interspeech, pp. 877\u2013880 (2010)","DOI":"10.21437\/Interspeech.2010-295"},{"key":"1_CR82","doi-asserted-by":"crossref","unstructured":"T\u00f3th, L., Frankel, J., Gosztolya, G., King, S.: Cross-lingual portability of MLP-based tandem features-a case study for English and Hungarian. In: Interspeech, pp. 2695\u20132698 (2008)","DOI":"10.21437\/Interspeech.2008-668"},{"key":"1_CR83","doi-asserted-by":"crossref","unstructured":"T\u00fcske, Z., Golik, P., Nolden, D., Schl\u00fcter, R., Ney, H.: Data augmentation, feature combination, and multilingual neural networks to improve ASR and KWS performance for low-resource languages. In: Interspeech, Singapore, pp. 1420\u20131424, September 2014","DOI":"10.21437\/Interspeech.2014-348"},{"key":"1_CR84","doi-asserted-by":"crossref","unstructured":"T\u00fcske, Z., Golik, P., Schl\u00fcter, R., Ney, H.: Acoustic modeling with deep neural networks using raw time signal for LVCSR. In: Interspeech, Singapore, pp. 890\u2013894, September 2014","DOI":"10.21437\/Interspeech.2014-223"},{"key":"1_CR85","doi-asserted-by":"crossref","unstructured":"T\u00fcske, Z., Golik, P., Schl\u00fcter, R., Ney, H.: Speaker adaptive joint training of gaussian mixture models and bottleneck features. In: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), Scottsdale, AZ, USA, pp. 596\u2013603, December 2015","DOI":"10.1109\/ASRU.2015.7404850"},{"key":"1_CR86","doi-asserted-by":"crossref","unstructured":"T\u00fcske, Z., Irie, K., Schl\u00fcter, R., Ney, H.: Investigation on log-linear interpolation of multi-domain neural network language model. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 6005\u20136009, Shanghai, China, March 2016","DOI":"10.1109\/ICASSP.2016.7472830"},{"key":"1_CR87","doi-asserted-by":"crossref","unstructured":"T\u00fcske, Z., Nolden, D., Schl\u00fcter, R., Ney, H.: Multilingual MRASTA features for low-resource keyword search and speech recognition systems. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) (2014)","DOI":"10.1109\/ICASSP.2014.6855129"},{"key":"1_CR88","doi-asserted-by":"crossref","unstructured":"T\u00fcske, Z., Schl\u00fcter, R., Ney, H.: Multilingual hierarchical MRASTA features for ASR. In: Interspeech, pp. 2222\u20132226, Lyon, France, August 2013","DOI":"10.21437\/Interspeech.2013-523"},{"key":"1_CR89","doi-asserted-by":"crossref","unstructured":"T\u00fcske, Z., Sundermeyer, M., Schl\u00fcter, R., Ney, H.: Context-dependent MLPs for LVCSR: TANDEM, hybrid or both? In: Interspeech, Portland, OR, USA, pp. 18\u201321, September 2012","DOI":"10.21437\/Interspeech.2012-5"},{"key":"1_CR90","doi-asserted-by":"crossref","unstructured":"T\u00fcske, Z., Tahir, M.A., Schl\u00fcter, R., Ney, H.: Integrating gaussian mixtures into deep neural networks: Softmax layer with hidden variables. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Brisbane, Australia, pp. 4285\u20134289, April 2015","DOI":"10.1109\/ICASSP.2015.7178779"},{"key":"1_CR91","doi-asserted-by":"crossref","unstructured":"Valente, F., Vepa, J., Plahl, C., Gollan, C., Hermansky, H., Schl\u00fcter, R.: Hierarchical neural networks feature extraction for LVCSR system. In: Interspeech, Antwerp, Belgium, pp. 42\u201345, August 2007","DOI":"10.21437\/Interspeech.2007-8"},{"key":"1_CR92","doi-asserted-by":"crossref","unstructured":"Waibel, A., Hanazawa, T., Hinton, G., Shikano, K., Lang, K.: Phoneme recognition: neural networks vs. hidden markov models. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), vol. 1, pp. 107\u2013110, April 1989","DOI":"10.1109\/ICASSP.1988.196523"},{"key":"1_CR93","doi-asserted-by":"crossref","unstructured":"Wiesler, S., Golik, P., Schl\u00fcter, R., Ney, H.: Investigations on sequence training of neural networks. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Brisbane, Australia, pp. 4565\u20134569, April 2015","DOI":"10.1109\/ICASSP.2015.7178835"},{"key":"1_CR94","doi-asserted-by":"crossref","unstructured":"Wiesler, S., Li, J., Xue, J.: Investigations on hessian-free optimization for cross-entropy training of deep neural networks. In: Interspeech, Lyon, France, pp. 3317\u20133321, August 2013","DOI":"10.21437\/Interspeech.2013-734"},{"key":"1_CR95","doi-asserted-by":"crossref","unstructured":"Wiesler, S., Richard, A., Schl\u00fcter, R., Ney, H.: Mean-normalized stochastic gradient for large-scale deep learning. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Florence, Italy, pp. 180\u2013184, May 2014","DOI":"10.1109\/ICASSP.2014.6853582"},{"key":"1_CR96","doi-asserted-by":"crossref","unstructured":"Xue, J., Li, J., Yu, D., Seltzer, M., Gong, Y.: Singular value decomposition based low-footprint speaker adaptation and personalization for deep neural network. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Florence, Italy, pp. 6359\u20136363, May 2014","DOI":"10.1109\/ICASSP.2014.6854828"},{"key":"1_CR97","unstructured":"Zeiler, M.D.: ADADELTA: An Adaptive Learning Rate Method. CoRR abs\/1212.5701 (2012)"},{"key":"1_CR98","doi-asserted-by":"crossref","unstructured":"Zeyer, A., Doetsch, P., Voigtlaender, P., Schl\u00fcter, R., Ney, H.: A comprehensive study of deep bidirectional LSTM RNNs for acoustic modeling in speech recognition. In: Interspeech, San Francisco. CA, USA, September 2016, submitted","DOI":"10.1109\/ICASSP.2017.7952599"},{"key":"1_CR99","doi-asserted-by":"crossref","unstructured":"Zeyer, A., Schl\u00fcter, R., Ney, H.: Towards online-recognition with deep bidirectional LSTM acoustic models. In: Interspeech, San Francisco, CA, USA, September 2016, submitted","DOI":"10.21437\/Interspeech.2016-759"},{"key":"1_CR100","unstructured":"Zhang, Y., Chen, G., Yu, D., Yao, K., Khudanpur, S., Glass, J.: Highway Long Short-Term Memory RNNs for Distant Speech Recognition. arXiv preprint (2015). arxiv:1510.08983"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-43958-7_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,4]],"date-time":"2025-06-04T17:28:33Z","timestamp":1749058113000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-43958-7_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319439570","9783319439587"],"references-count":100,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-43958-7_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2016]]}}}