{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,16]],"date-time":"2025-12-16T12:25:20Z","timestamp":1765887920449,"version":"3.37.3"},"reference-count":98,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2017,9,6]],"date-time":"2017-09-06T00:00:00Z","timestamp":1504656000000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2018,6]]},"DOI":"10.1007\/s11042-017-5160-5","type":"journal-article","created":{"date-parts":[[2017,9,6]],"date-time":"2017-09-06T22:59:30Z","timestamp":1504738770000},"page":"15875-15911","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["Speech recognition in a dialog system: from conventional to deep processing"],"prefix":"10.1007","volume":"77","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4274-4396","authenticated-orcid":false,"given":"Aldonso","family":"Becerra","sequence":"first","affiliation":[]},{"given":"J. Ismael","family":"de la Rosa","sequence":"additional","affiliation":[]},{"given":"Efr\u00e9n","family":"Gonz\u00e1lez","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,9,6]]},"reference":[{"key":"5160_CR1","doi-asserted-by":"publisher","unstructured":"Ali A, Zhang Y, Cardinal P, Dahak N, Vogel S, Glass J (2014) A complete KALDI recipe for building Arabic speech recognition systems. In: Proceeedings of IEEE Workshop Spokoen Language Technology (SLT), pp 525\u2013529. https:\/\/doi.org\/10.1109\/SLT.2014.7078629","DOI":"10.1109\/SLT.2014.7078629"},{"issue":"2","key":"5160_CR2","first-page":"181","volume":"6","author":"MA Anusuya","year":"2009","unstructured":"Anusuya MA, Katti SK (2009) Speech recognition by machine: a review. Int J Comput Sci Inf Secur 6(2):181\u2013205","journal-title":"Int J Comput Sci Inf Secur"},{"key":"5160_CR3","doi-asserted-by":"crossref","unstructured":"Bacchiani M, Senior A, Heigold G (2014) Asynchronous, Online, GMM-free training of a context dependent acoustic model for speech recognition. In: Proceedings of European Conference on Speech Communication and Technology, pp 1900\u20131904","DOI":"10.21437\/Interspeech.2014-430"},{"key":"5160_CR4","unstructured":"Bengio Y, Lamblin P, Popovici D, Larochelle H (2006) Greedy layer-wise training of deep networks. In: Proceedings of Neural Information Processing Systems, pp 153\u2013160"},{"issue":"3","key":"5160_CR5","doi-asserted-by":"crossref","first-page":"869","DOI":"10.1093\/ietisy\/e89-d.3.869","volume":"E89-D","author":"J Bilmes","year":"2006","unstructured":"Bilmes J (2006) What HMMs can do. IEICE Trans Inf Syst E89-D(3):869\u2013891","journal-title":"IEICE Trans Inf Syst"},{"key":"5160_CR6","volume-title":"Pattern recognition and machine learning","author":"C Bishop","year":"2006","unstructured":"Bishop C (2006) Pattern recognition and machine learning. Springer, NY"},{"key":"5160_CR7","doi-asserted-by":"crossref","unstructured":"Cai M, Shi Y, Liu J (2013) Deep maxout neural networks for speech recognition. In: Proceedings of IEEE Automatic Speech Recognition and Understanding Workshop, pp 291\u2013296","DOI":"10.1109\/ASRU.2013.6707745"},{"key":"5160_CR8","doi-asserted-by":"crossref","unstructured":"Chen X, Eversole A, Li G, Yu D, Seide F (2012) Pipelined back-propagation for context-dependent deep neural networks. In: Proceedings of INTERSPEECH","DOI":"10.21437\/Interspeech.2012-7"},{"key":"5160_CR9","doi-asserted-by":"crossref","unstructured":"Dahl GE, Yu D, Deng L, Acero A (2011) Large vocabulary continuous speech recognition with context-dependent DBN-HMMs. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 4688\u20134691","DOI":"10.1109\/ICASSP.2011.5947401"},{"issue":"1","key":"5160_CR10","doi-asserted-by":"crossref","first-page":"30","DOI":"10.1109\/TASL.2011.2134090","volume":"20","author":"GE Dahl","year":"2012","unstructured":"Dahl G E, Yu D, Deng L, Acero A (2012) Context-dependent pre-trained deep neural networks for large vocabulary speech recognition. IEEE Trans Audio Speech Lang Process 20(1):30\u201342","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"5160_CR11","doi-asserted-by":"crossref","unstructured":"Dahl GE, Sainath TN, Hinton G (2013) Improving deep neural networks for LVCSR using rectified linear units and dropout. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 8609\u20138613","DOI":"10.1109\/ICASSP.2013.6639346"},{"issue":"4","key":"5160_CR12","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1109\/TASSP.1980.1163420","volume":"ASSP-28","author":"S Davis","year":"1980","unstructured":"Davis S, Mermelstein P (1980) Comparison of parametric representations for monosyllabic word recognition. IEEE Trans Acoust Speech, Signal Process ASSP-28 (4):357\u2013366","journal-title":"IEEE Trans Acoust Speech, Signal Process"},{"issue":"1","key":"5160_CR13","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1111\/j.2517-6161.1977.tb01600.x","volume":"39","author":"A Dempster","year":"1977","unstructured":"Dempster A, Laird N, Rubin D (1977) Maximum likelihood from incomplete data via the EM algorithm. J Royal Statist Soc 39(1):1\u201338","journal-title":"J Royal Statist Soc"},{"key":"5160_CR14","doi-asserted-by":"publisher","first-page":"e2","DOI":"10.1017\/atsip.2013.9","volume":"3","author":"L Deng","year":"2014","unstructured":"Deng L (2014) A tutorial survey of architectures, algorithms, and applications for deep learning. APSIPA Trans Signal Inf Process 3:e2. https:\/\/doi.org\/10.1017\/atsip.2013.9","journal-title":"APSIPA Trans Signal Inf Process"},{"issue":"5","key":"5160_CR15","doi-asserted-by":"crossref","first-page":"1060","DOI":"10.1109\/TASL.2013.2244083","volume":"21","author":"L Deng","year":"2013","unstructured":"Deng L, Li X (2013) Machine learning paradigms for speech recognition: an overview. IEEE Trans Audio Speech, Lang Process 21(5):1060\u20131089","journal-title":"IEEE Trans Audio Speech, Lang Process"},{"key":"5160_CR16","doi-asserted-by":"crossref","DOI":"10.1561\/9781601988157","volume-title":"Deep learning: methods and applications","author":"L Deng","year":"2014","unstructured":"Deng L, Yu D (2014) Deep learning: methods and applications. Now Plublishers, Washington"},{"issue":"7","key":"5160_CR17","doi-asserted-by":"crossref","first-page":"1677","DOI":"10.1109\/78.134406","volume":"39","author":"L Deng","year":"1991","unstructured":"Deng L, Kenny P, Lennig M, Gupta V, Seitz F, Mermelstein P (1991) Phonemic hidden Markov models with continuous mixture output densities for large vocabulary word recognition. IEEE Trans Signal Process 39(7):1677\u20131681","journal-title":"IEEE Trans Signal Process"},{"key":"5160_CR18","doi-asserted-by":"crossref","unstructured":"Deng L, Yu D, Platt J (2012) Scalable stacking and learning for building deep architectures. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 2133\u20132136","DOI":"10.1109\/ICASSP.2012.6288333"},{"key":"5160_CR19","doi-asserted-by":"crossref","unstructured":"Deng L, Hinton G, Kingsbury B (2013) New types of deep neural network learning for speech recognition and related applications: an overview. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 8599\u20138603","DOI":"10.1109\/ICASSP.2013.6639344"},{"key":"5160_CR20","doi-asserted-by":"publisher","unstructured":"Deng L, Li J, Huang JT, Yao K, Yu D, Seide F, Seltzer ML, Zweig G, He X, Williams J, Gong Y, Acero A (2013) Recent advances in deep learning for speech research at Microsoft. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 8604\u20138608. https:\/\/doi.org\/10.1109\/ICASSP.2013.6639345","DOI":"10.1109\/ICASSP.2013.6639345"},{"key":"5160_CR21","volume-title":"Pattern classification","author":"R Duda","year":"2001","unstructured":"Duda R, Hart P, Stork D (2001) Pattern classification. Wiley, NY"},{"issue":"3","key":"5160_CR22","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1561\/2000000004","volume":"1","author":"MJF Gales","year":"2007","unstructured":"Gales MJF, Young SJ (2007) The application of hidden Markov models in speech recognition. Found Trends Signal Process 1(3):195\u2013304","journal-title":"Found Trends Signal Process"},{"issue":"2","key":"5160_CR23","doi-asserted-by":"crossref","first-page":"291","DOI":"10.1109\/89.279278","volume":"2","author":"J Gauvain","year":"1994","unstructured":"Gauvain J, Lee Ch (1994) Maximum a posteriori estimation for multivariate gaussian mixture observations of Markov chains. IEEE Trans Speech Audio Process 2 (2):291\u2013298","journal-title":"IEEE Trans Speech Audio Process"},{"key":"5160_CR24","volume-title":"Pattern recognition and image analysis","author":"E Gose","year":"1996","unstructured":"Gose E, Johnsonbaugh R, Jost S (1996) Pattern recognition and image analysis. Prentice-Hall, New Jersey"},{"issue":"4","key":"5160_CR25","first-page":"101","volume":"4","author":"S Gupta","year":"2013","unstructured":"Gupta S, Jaafar J, wan Ahmad WF, Bansal A (2013) Feature extraction using MFCC. Signal Image Process: Int J 4(4):101\u2013108","journal-title":"Signal Image Process: Int J"},{"issue":"6","key":"5160_CR26","doi-asserted-by":"crossref","first-page":"58","DOI":"10.1109\/MSP.2012.2197232","volume":"29","author":"G Heigold","year":"2012","unstructured":"Heigold G, Ney H, Schl\u00fcter R, Wiesler S (2012) Discriminative training for automatic speech recognition: modeling, criteria, optimization, implementation, and performance. IEEE Signal Process Mag 29(6):58\u201369","journal-title":"IEEE Signal Process Mag"},{"issue":"12","key":"5160_CR27","doi-asserted-by":"crossref","first-page":"2616","DOI":"10.1109\/TASL.2013.2280234","volume":"21","author":"G Heigold","year":"2013","unstructured":"Heigold G, Ney H, Schl\u00fcter R (2013) Investigations on an EM-style optimization algorithm for discriminative training of HMMs. IEEE Trans Audio Speech Lang Process 21(12):2616\u2013 2626","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"5160_CR28","volume-title":"Handbook of neural networks signal processing","author":"Y Hen Hu","year":"2002","unstructured":"Hen Hu Y, Hwang J (2002) Handbook of neural networks signal processing. CRC Press, Florida"},{"key":"5160_CR29","unstructured":"Hinton G (2010) A practical guide to training restricted Boltzmann machines. Technical Report UTML TR, pp 2010\u2013003"},{"issue":"5786","key":"5160_CR30","doi-asserted-by":"crossref","first-page":"504","DOI":"10.1126\/science.1127647","volume":"313","author":"G Hinton","year":"2006","unstructured":"Hinton G, Salakhutdinov R (2006) Reducing the dimensionality of data with neural networks. Science 313(5786):504\u2013507","journal-title":"Science"},{"issue":"7","key":"5160_CR31","doi-asserted-by":"crossref","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"G Hinton","year":"2006","unstructured":"Hinton G, Osindero S, Teh Y (2006) A fast learning algorithm for deep belief nets. Neural Comput 18(7):1527\u20131554","journal-title":"Neural Comput"},{"issue":"6","key":"5160_CR32","doi-asserted-by":"crossref","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton G, Deng L, Yu D, Dahl GE, Mohamed A, Jaitly N, Senior A, Vanhoucke V, Nguyen P, Sainath TN, Kingsbury B (2012) Deep neural networks for acustic modeling in speech recognition. IEEE Signal Process Mag 29(6):82\u201397","journal-title":"IEEE Signal Process Mag"},{"key":"5160_CR33","unstructured":"Hinton G, Srivastava N, Krizhevsky A, Sutskever I, Salakhutdinov R (2012) Improving neural networks by preventing co-adaptation of feature detector, arXiv: 1207.0580v1"},{"key":"5160_CR34","doi-asserted-by":"publisher","first-page":"1771","DOI":"10.1162\/089976602760128018","volume":"14","author":"GE Hinton","year":"2002","unstructured":"Hinton GE (2002) Training products of experts by minimizing contrastive divergence. Neural Comput 14:1771\u20131800. https:\/\/doi.org\/10.1162\/089976602760128018","journal-title":"Neural Comput"},{"key":"5160_CR35","volume-title":"Spoken language processing: a guide to theory, algorithm and system development","author":"X Huang","year":"2001","unstructured":"Huang X, Acero A, Hon H (2001) Spoken language processing: a guide to theory, algorithm and system development. Prentice Hall, NJ"},{"key":"5160_CR36","doi-asserted-by":"crossref","unstructured":"Huang Y, Yu D, Liu C, Gong Y (2014) A comparative analytic study on the Gaussian mixture and context dependent deep neural network hidden Markov models. In: Proceedings of INTERSPEECH 2014, pp 1895\u20131899","DOI":"10.21437\/Interspeech.2014-429"},{"key":"5160_CR37","doi-asserted-by":"crossref","unstructured":"Huang Z, Li J, Weng Ch, Lee Ch (2014) Beyond cross-entropy: towards better frame-level objective functions for deep neural network training in automatic speech recognition. In: Proceeedings of INTERSPEECH 2014, pp 1214\u20131218","DOI":"10.21437\/Interspeech.2014-306"},{"key":"5160_CR38","volume-title":"Exploring deep learning methods for discovering features in speech signals. Dissertation","author":"N Jaitly","year":"2014","unstructured":"Jaitly N (2014) Exploring deep learning methods for discovering features in speech signals. Dissertation. University of Toronto, Toronto"},{"key":"5160_CR39","doi-asserted-by":"crossref","unstructured":"Jaitly N, Hinton G (2013) Using an autoencoder with deformable templates to discover features for automated speech recognition. In: Proceedings of INTERSPEECH, pp 1737\u20131740","DOI":"10.21437\/Interspeech.2013-432"},{"key":"5160_CR40","doi-asserted-by":"crossref","unstructured":"Jaitly N, Nguyen P, Senior A, Vanhoucke V (2012) Application of pretrained deep neural networks to large vocabulary conversational speech recognition. UTML TR","DOI":"10.21437\/Interspeech.2012-10"},{"issue":"4","key":"5160_CR41","doi-asserted-by":"crossref","first-page":"589","DOI":"10.1016\/j.csl.2009.08.002","volume":"24","author":"H Jiang","year":"2010","unstructured":"Jiang H (2010) Discriminative training of HMMs for automatic speech recognition: A survey. Comput Speech Lang 24(4):589\u2013608","journal-title":"Comput Speech Lang"},{"issue":"2","key":"5160_CR42","doi-asserted-by":"crossref","first-page":"307","DOI":"10.1109\/TIT.1986.1057145","volume":"IT-32","author":"BH Juang","year":"1986","unstructured":"Juang BH, Levinson SE, Sondhi M (1986) Maximum likelihood estimation for multivariate mixture observations of Markov chains. IEEE Transactions on Information Theory IT-32(2):307\u2013309","journal-title":"IEEE Transactions on Information Theory"},{"key":"5160_CR43","volume-title":"Speech and language processing: an introduction to natural language processing, computational linguistics and speech recognition","author":"D Jurafsky","year":"2008","unstructured":"Jurafsky D, Martin J (2008) Speech and language processing: an introduction to natural language processing, computational linguistics and speech recognition. Pearson, NJ"},{"issue":"1","key":"5160_CR44","first-page":"1","volume":"5","author":"K Kaur","year":"2015","unstructured":"Kaur K, Jain N (2015) Feature extraction and classification for automatic speaker recognition system \u2013 a review. Int J Adv Res Comput Sci Softw Eng 5(1):1\u20136","journal-title":"Int J Adv Res Comput Sci Softw Eng"},{"key":"5160_CR45","doi-asserted-by":"publisher","unstructured":"Li J, Yu D, Huang JT, Gong Y (2012) Improving wideband speech recognition using mixed-bandwidth training data in CD-DNN-HMM. In: Proceedings of IEEE Workshop on Spoken Language Technology SLT, pp 131\u2013136. https:\/\/doi.org\/10.1109\/SLT.2012.6424210","DOI":"10.1109\/SLT.2012.6424210"},{"key":"5160_CR46","doi-asserted-by":"crossref","first-page":"251","DOI":"10.1016\/j.neucom.2014.07.087","volume":"170","author":"X Li","year":"2015","unstructured":"Li X, Yang Y, Pang Z, Wu X (2015) A comparative study on selecting acoustic modeling units in deep neural networks based large vocabulary chinese speech recognition. Neurocomputing 170:251\u2013256","journal-title":"Neurocomputing"},{"key":"5160_CR47","unstructured":"Maas A, Hannun A, Ng A (2013) Rectifier nonlinearities improve neural network acoustic models. In: Proceedings of International Conference on Machine Learning"},{"key":"5160_CR48","doi-asserted-by":"crossref","unstructured":"Macho D, Mauuary L, No\u00e9 B, Cheng YM, Ealey D, Jou-vet D, Kelleher H, Pearce D, Saadoun F (2002) Evaluation of a noise-robust DSR front-end on Aurora databases. In: Proceedings of International Conference on Spoken Language Processing, pp 16\u201320","DOI":"10.21437\/ICSLP.2002-3"},{"key":"5160_CR49","volume-title":"Mixture models","author":"G McLachlan","year":"1988","unstructured":"McLachlan G (1988) Mixture models. Marcel Dekker, New York"},{"key":"5160_CR50","doi-asserted-by":"crossref","unstructured":"Miao Y, Metze F (2013) Improving low-resource CD-DNN-HMM using dropout and multilingual DNN training. In: Proceedings of INTERSPEECH 2013, pp 2237\u20132241","DOI":"10.21437\/Interspeech.2013-526"},{"issue":"1","key":"5160_CR51","doi-asserted-by":"crossref","first-page":"14","DOI":"10.1109\/TASL.2011.2109382","volume":"20","author":"A Mohamed","year":"2012","unstructured":"Mohamed A, Dahl GE, Hinton G (2012) Acoustic modeling using deep belief networks. IEEE Trans Audio Speech, Lang Process 20(1):14\u201322","journal-title":"IEEE Trans Audio Speech, Lang Process"},{"key":"5160_CR52","unstructured":"Mohamed A, Dahl GE, Hinton G (2009) Deep Belief Networks for phone recognition. In: Proceedings of NIPS Workshop on Deep Learning for Speech Recognition and Related Applications"},{"issue":"3","key":"5160_CR53","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1109\/79.382443","volume":"12","author":"N Morgan","year":"1995","unstructured":"Morgan N, Bourlard H (1995) An introduction to hybrid HMM\/connectionist continuous speech recognition. IEEE Signal Process Mag 12(3):25\u201342","journal-title":"IEEE Signal Process Mag"},{"issue":"3","key":"5160_CR54","doi-asserted-by":"crossref","first-page":"1058","DOI":"10.1093\/ietisy\/e89-d.3.1058","volume":"E89-D","author":"S Nakagawa","year":"2006","unstructured":"Nakagawa S, Zhang W, Takahashi M (2006) Text-independent\/text-prompted speakers recognition by combining speaker-specific GMM with speaker adapted syllable-based HMM. IEICE Trans Inf Syst E89-D(3):1058\u20131065","journal-title":"IEICE Trans Inf Syst"},{"key":"5160_CR55","doi-asserted-by":"crossref","unstructured":"Niu J, Xie L, Jia L, Hu N (2013) Context-dependent deep neural networks for commercial Mandarin speech recognition applications. In: Proceedings of Asia-Pacific Signal and Information Processing Association Annual Summit and Conference","DOI":"10.1109\/APSIPA.2013.6694268"},{"issue":"4","key":"5160_CR56","doi-asserted-by":"crossref","first-page":"458","DOI":"10.1587\/transele.E94.C.458","volume":"E94C","author":"H Noguchi","year":"2011","unstructured":"Noguchi H, Miura K, Fujinaga T, Sugahara T, Kawaguchi H, Yoshimoto M (2011) VLSI Architecture of GMM Processing and Viterbi Decoder for 60,000-Word Real-Time Continuous Speech Recognition. IEICE Trans Electron E94C(4):458\u2013467","journal-title":"IEICE Trans Electron"},{"key":"5160_CR57","doi-asserted-by":"crossref","unstructured":"Pan J, Liu C, Wang Z, Hu Y, Jiang H (2012) Investigation of deep neural networks (DNN) for large vocabulary continuous speech recognition: why DNN surpass GMMs in acoustic modeling. In: Proceedings of International Symposium on Chinese Spoken Language Processing, pp 301\u2013305","DOI":"10.1109\/ISCSLP.2012.6423452"},{"issue":"9","key":"5160_CR58","doi-asserted-by":"crossref","first-page":"1215","DOI":"10.1109\/5.237532","volume":"81","author":"JW Picone","year":"1993","unstructured":"Picone JW (1993) Signal modeling techniques in speech recognition. Proc IEEE 81(9):1215\u20131247","journal-title":"Proc IEEE"},{"issue":"2","key":"5160_CR59","doi-asserted-by":"crossref","first-page":"404","DOI":"10.1016\/j.csl.2010.06.003","volume":"25","author":"D Povey","year":"2011","unstructured":"Povey D, Burget L, Agarwal M, Akyazi P, Kai F, Ghoshal A, Glembekb O, Goel N, Karafi\u00e1t M, Rastrowh A, Rose R, Schwarz P, Thomash S (2011) The subspace Gaussian mixture model - A structured model for speech recognition. Comput Speech Lang 25(2):404\u2013439","journal-title":"Comput Speech Lang"},{"key":"5160_CR60","unstructured":"Povey D, Ghoshal A, Boulianne G, Burget L, Glembek O, Goel N, Hannemann M, Motlicek P, Qian Y, Schwarz P, Silovsky J, Stemmer G, Vesely K (2011) The Kaldi speech recognition toolkit. In: Proceedings of IEEE Automatic Speech Recognition and Understanding Workshop"},{"issue":"2","key":"5160_CR61","doi-asserted-by":"crossref","first-page":"257","DOI":"10.1109\/5.18626","volume":"77","author":"L Rabiner","year":"1989","unstructured":"Rabiner L (1989) A tutorial on hidden Markov models and selected applications in speech recognition. Proceed IEEE 77(2):257\u2013286","journal-title":"Proceed IEEE"},{"key":"5160_CR62","volume-title":"Fundamentals of speech recognition","author":"L Rabiner","year":"1993","unstructured":"Rabiner L, Juang B (1993) Fundamentals of speech recognition. Prentice-Hall, New Jersey"},{"issue":"1-2","key":"5160_CR63","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1561\/2000000001","volume":"1","author":"L Rabiner","year":"2007","unstructured":"Rabiner L, Schafer R (2007) Introduction to digital speech processing. Found Trends Signal Process 1(1-2):1\u2013194","journal-title":"Found Trends Signal Process"},{"key":"5160_CR64","doi-asserted-by":"crossref","unstructured":"Rath S, Povey D, Vesel K, Cernock J (2013) Improved feature processing for deep neural networks. In: Proceedings of INTERSPEECH 2013, pp 109\u2013113","DOI":"10.21437\/Interspeech.2013-48"},{"issue":"1","key":"5160_CR65","doi-asserted-by":"crossref","first-page":"19","DOI":"10.1006\/dspr.1999.0361","volume":"10","author":"DA Reynolds","year":"2000","unstructured":"Reynolds DA, Quatieri TF, Dunn TRB (2000) Speaker verification using adapted gaussian mixture models. Digit Signal Process 10(1):19\u201341","journal-title":"Digit Signal Process"},{"key":"5160_CR66","doi-asserted-by":"crossref","first-page":"533","DOI":"10.1038\/323533a0","volume":"f323","author":"DE Rumelhart","year":"1986","unstructured":"Rumelhart DE, Hinton G, Williams RJ (1986) Learning representations by back-propagating errors. Nature f323:533\u2013536","journal-title":"Nature"},{"key":"5160_CR67","doi-asserted-by":"crossref","unstructured":"Sainath TN, Kingsbury B, Ramabhadran B, Fousek P, Novak P, Mohamed A (2011) Making Deep Belief Networks effective for large vocabulary continuous speech recognition. In: Proceedings of IEEE Automatic Speech Recognition and Understanding Workshop, pp 30\u201335","DOI":"10.1109\/ASRU.2011.6163900"},{"key":"5160_CR68","unstructured":"Sainath T N, Kingsbury B, Ramabhadran B (2012) Improving training time of deep belief networks through hybrid pre-training and larger batch sizes. In: Proceedings of Neural Information Processing Systems, Workshop on Log-linear Models"},{"key":"5160_CR69","doi-asserted-by":"crossref","unstructured":"Sainath TN, Mohamed A, Kingsbury B, Ramabhadran B (2013) Deep Convolutional neural networks for LVCSR. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 8614\u20138618","DOI":"10.1109\/ICASSP.2013.6639347"},{"issue":"6","key":"5160_CR70","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/MSP.2012.2197156","volume":"29","author":"G Saon","year":"2012","unstructured":"Saon G, Chien J (2012) Large-vocabulary continuous speech recognition systems: a look at some recent advances. IEEE Signal Process Mag 29(6):18\u201333","journal-title":"IEEE Signal Process Mag"},{"key":"5160_CR71","unstructured":"Saon G, Chien J (2012) Recent developments in large vocabulary continuous speech recognition. In: Proceedings of Asia Pacific Signal and Information Processing Association"},{"key":"5160_CR72","unstructured":"Scowen R (1993) Extended bnf - generic base standards. In: Proceedings of Software Engineering Standards Symp, pp 25\u201334"},{"key":"5160_CR73","doi-asserted-by":"crossref","unstructured":"Seide F, Li G, Chen X, Yu D (2011) Feature engineering in context-dependent deep neural networks for conversational speech transcription. In: Proceedings of IEEE Automatic Speech Recognition and Understanding Workshop, pp 24\u201329","DOI":"10.1109\/ASRU.2011.6163899"},{"key":"5160_CR74","doi-asserted-by":"crossref","unstructured":"Seide F, Li G, Yu D (2011) Conversational speech transcription using context-dependent deep neural networks. In: Proceedings of INTERSPEECH 2011, pp 437\u2013440","DOI":"10.21437\/Interspeech.2011-169"},{"key":"5160_CR75","doi-asserted-by":"crossref","unstructured":"Seltzer ML, Yu D, Wang Y (2013) An Investigation of deep neural networks for noise robust speech recognition. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 7398\u20137402","DOI":"10.1109\/ICASSP.2013.6639100"},{"key":"5160_CR76","unstructured":"Senior A, Heigold G, Bacchiani M, Liao H (2014) GMM-free DNN training. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 5639\u20135643"},{"key":"5160_CR77","doi-asserted-by":"crossref","unstructured":"Sharma S, Ellis D, Kajarekar S, Jain P, Hermansky H (2000) Feature extraction using non-linear transformation for robust speech recognition on the aurora database. In: Proceedings of IEEE International Conference on Acoustics, Speechs and Signal Processing, pp II1117\u2013II1120","DOI":"10.1109\/ICASSP.2000.859160"},{"issue":"2013","key":"5160_CR78","first-page":"148","volume":"106","author":"SM Siniscalchi","year":"2012","unstructured":"Siniscalchi SM, Yu D, Deng L, Lee Ch (2012) Exploiting deep neural networks for detection- based speech recognition. Neurocomputing 106(2013):148\u2013157","journal-title":"Neurocomputing"},{"key":"5160_CR79","unstructured":"Stahlberg F, Schlippe T, Stephan V, Schultz T (2014) Towards automatic speech recognition without pronunciation dictionary, transcribed speech and text resources in the target language using cross-lingual word-to-phoneme alignment. In: Proceedings of Workshop on Spoken Language Technologies for Under-Resourced Languages, pp 73\u201380"},{"key":"5160_CR80","first-page":"121","volume":"2","author":"H Strik","year":"1997","unstructured":"Strik H, Russel A, Van Den Heuvel H, Cucchiarini C, Boves L (1997) A spoken dialog system for the dutch public transport information service. Int J Technol 2:121\u2013131","journal-title":"Int J Technol"},{"issue":"6","key":"5160_CR81","doi-asserted-by":"crossref","first-page":"1122","DOI":"10.1109\/TNNLS.2015.2461554","volume":"27","author":"D Tao","year":"2016","unstructured":"Tao D, Cheng Y, Song M, Lin X (2016) Manifold ranking-based matrix factorization for saliency detection. IEEE Trans Neural Netw Learn Syst 27(6):1122\u20131134","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"issue":"3","key":"5160_CR82","doi-asserted-by":"crossref","first-page":"756","DOI":"10.1109\/TCYB.2015.2414920","volume":"46","author":"D Tao","year":"2016","unstructured":"Tao D, Lin X, Jin L, Li X (2016) Principal component 2-D long short-term memory for font recognition on single chinese characters. IEEE Trans Cybern 46(3):756\u2013765","journal-title":"IEEE Trans Cybern"},{"issue":"6","key":"5160_CR83","doi-asserted-by":"crossref","first-page":"2726","DOI":"10.1109\/TIP.2016.2553446","volume":"25","author":"D Tao","year":"2016","unstructured":"Tao D, Guo Y, Song M, Li Y, Yu Z, Tang Y (2016) Person re-identification by dual-regularized KISS metric learning. IEEE Trans Image Process 25(6):2726\u20132738","journal-title":"IEEE Trans Image Process"},{"key":"5160_CR84","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1016\/S0925-2312(00)00308-8","volume":"37","author":"E Trentin","year":"2001","unstructured":"Trentin E, Gori M (2001) A survey of hybrid ANN\/HMM models for automatic speech recognition. Neurocomputing 37:91\u2013126","journal-title":"Neurocomputing"},{"key":"5160_CR85","doi-asserted-by":"crossref","unstructured":"Vesely K, Ghoshal A, Burget L, Povey D (2013) Sequence-discriminative training of deep neural networks. In: Proceedings of INTERSPEECH 2013, pp 2345\u20132349","DOI":"10.21437\/Interspeech.2013-548"},{"key":"5160_CR86","doi-asserted-by":"crossref","unstructured":"Vesely K, Hannemann M, Burget L (2013) Semi-Supervised training of Deep Neural Networks. In: Proceedings of IEEE Conference of Automatic Speech Recognition and Understanding Workshop, pp 267\u2013272","DOI":"10.1109\/ASRU.2013.6707741"},{"key":"5160_CR87","volume-title":"Context-dependent acoustic modelling for speech recognition. Dissertation","author":"G Wang","year":"2014","unstructured":"Wang G (2014) Context-dependent acoustic modelling for speech recognition. Dissertation. National University of Singapur, Singapur"},{"issue":"1","key":"5160_CR88","first-page":"1070","volume":"21","author":"Y Xu","year":"2014","unstructured":"Xu Y, Du J, Dai L R, Lee C h (2014) An experimental study on speech enhancement based on deep neural networks. IEEE Signal Process Lett 21(1):1070\u20139908","journal-title":"IEEE Signal Process Lett"},{"key":"5160_CR89","doi-asserted-by":"crossref","unstructured":"Yao K, Yu D, Seide F, Su H, Deng L, Gong Y (2012) Adaptation of context-dependent deep neural networks for automatic speech recognition. In: Proceedings of IEEE Spoken Language Technology Workshop, pp 366\u2013369","DOI":"10.1109\/SLT.2012.6424251"},{"issue":"5","key":"5160_CR90","doi-asserted-by":"crossref","first-page":"45","DOI":"10.1109\/79.536824","volume":"13","author":"S Young","year":"1996","unstructured":"Young S (1996) Large vocabulary continuous speech recognition: a review. IEEE Signal Process Mag 13(5):45\u201357","journal-title":"IEEE Signal Process Mag"},{"key":"5160_CR91","doi-asserted-by":"crossref","unstructured":"Young S (2008) HMMs and related speech recognition technologies. In: Benesty J (ed) Springer Handbook of Speech Processing. Springer Berlin Heidelberg, Berlin, pp 539\u2013558","DOI":"10.1007\/978-3-540-49127-9_27"},{"key":"5160_CR92","volume-title":"The HTK Book (for version 3.4)","author":"S Young","year":"2006","unstructured":"Young S, Evermann G, Gales M, Hain T, Kershaw D, Liu X, Moore G, Odell J, Ollason D, Povey D, Valtchev V, Woodland P (2006) The HTK Book (for version 3.4). Cambridge University Engineering Department, UK"},{"key":"5160_CR93","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4471-5779-3","volume-title":"Automatic speech recognition: a deep learning approach","author":"D Yu","year":"2015","unstructured":"Yu D, Deng L (2015) Automatic speech recognition: a deep learning approach. Springer, London"},{"key":"5160_CR94","unstructured":"Yu D, Deng L, Dahl GE (2010) Roles of pretraining and fine-tuning in context-dependent DBN-HMMs for real-world speech recognition. In: Proceedings of NIPS Workshop on Deep Learning and Unsupervised Feature Learning"},{"key":"5160_CR95","volume-title":"Discriminative pretraining of deep neural networks","author":"D Yu","year":"2011","unstructured":"Yu D, Deng L, Li G, Seide F (2011) Discriminative pretraining of deep neural networks. Patent Filing, US"},{"key":"5160_CR96","doi-asserted-by":"crossref","unstructured":"Zhang C, Woodland PC (2014) Standalone training of context-dependent deep neural network acoustic models. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 5597\u20135601","DOI":"10.1109\/ICASSP.2014.6854674"},{"key":"5160_CR97","doi-asserted-by":"crossref","unstructured":"Zhang S, Bao Y, Zhou P, Jiang H, Li-Rong D (2014) Improving deep neural networks for LVCSR using dropout and shrinking structure. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp 6899\u20136903","DOI":"10.1109\/ICASSP.2014.6854927"},{"key":"5160_CR98","doi-asserted-by":"publisher","unstructured":"Zhang X, Trmal J, Povey D, Khudanpur S (2014) Improving deep neural network acoustic models using generalized maxout networks. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing. https:\/\/doi.org\/10.1109\/ICASSP.2014.6853589","DOI":"10.1109\/ICASSP.2014.6853589"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-017-5160-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-017-5160-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-017-5160-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,6,26]],"date-time":"2024-06-26T19:19:32Z","timestamp":1719429572000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-017-5160-5"}},"subtitle":["A case study applied to Spanish"],"short-title":[],"issued":{"date-parts":[[2017,9,6]]},"references-count":98,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2018,6]]}},"alternative-id":["5160"],"URL":"https:\/\/doi.org\/10.1007\/s11042-017-5160-5","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2017,9,6]]}}}