{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,16]],"date-time":"2025-12-16T12:31:46Z","timestamp":1765888306820,"version":"3.37.3"},"reference-count":102,"publisher":"Springer Science and Business Media LLC","issue":"27-28","license":[{"start":{"date-parts":[[2020,3,27]],"date-time":"2020-03-27T00:00:00Z","timestamp":1585267200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,3,27]],"date-time":"2020-03-27T00:00:00Z","timestamp":1585267200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2020,7]]},"DOI":"10.1007\/s11042-020-08782-0","type":"journal-article","created":{"date-parts":[[2020,3,27]],"date-time":"2020-03-27T12:02:58Z","timestamp":1585310578000},"page":"19669-19715","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["A comparative case study of neural network training by using frame-level cost functions for automatic speech recognition purposes in Spanish"],"prefix":"10.1007","volume":"79","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4274-4396","authenticated-orcid":false,"given":"Aldonso","family":"Becerra","sequence":"first","affiliation":[]},{"given":"J. Ismael de la","family":"Rosa","sequence":"additional","affiliation":[]},{"given":"Efr\u00e9n","family":"Gonz\u00e1lez","sequence":"additional","affiliation":[]},{"given":"A. David","family":"Pedroza","sequence":"additional","affiliation":[]},{"given":"N. 
Iracemi","family":"Escalante","sequence":"additional","affiliation":[]},{"given":"Eduardo","family":"Santos","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,3,27]]},"reference":[{"key":"8782_CR1","doi-asserted-by":"publisher","unstructured":"Ali A, Zhang Y, Cardinal P, Dahak N, Vogel S, Glass J (2014) A complete kaldi recipe for building arabic speech recognition systems. In: Proceedings of IEEE workshop spoken language technology (SLT). https:\/\/doi.org\/10.1109\/SLT.2014.7078629, pp 525\u2013529","DOI":"10.1109\/SLT.2014.7078629"},{"key":"8782_CR2","doi-asserted-by":"crossref","unstructured":"Allauzen C, Riley M, Schalkwyk J, Skut W, Mohri M (2007) Openfst: a general and efficient weighted finite-state transducer library. In: Proceedings of int conf. on implementation and application of automata (CIAA), pp 11\u201323","DOI":"10.1007\/978-3-540-76336-9_3"},{"key":"8782_CR3","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1016\/j.neucom.2015.02.092","volume":"174","author":"AD Alm\u00e1si","year":"2015","unstructured":"Alm\u00e1si AD, Wo\u017aniak S, Wo\u017aniak W, Cristea V, Leblebici Y, Engbersen T (2015) Review of advances in neural networks: neural design technology stack. Neurocomputing 174:31\u201341. https:\/\/doi.org\/10.1016\/j.neucom.2015.02.092","journal-title":"Neurocomputing"},{"key":"8782_CR4","volume-title":"Introduction to machine learning","author":"E Alpaydin","year":"2010","unstructured":"Alpaydin E (2010) Introduction to machine learning. MIT Press, Massachusetts"},{"issue":"2","key":"8782_CR5","first-page":"181","volume":"6","author":"MA Anusuya","year":"2009","unstructured":"Anusuya MA, Katti SK (2009) Speech recognition by machine: a review. 
Int J Comput Sci Inf Secur 6(2):181\u2013205","journal-title":"Int J Comput Sci Inf Secur"},{"key":"8782_CR6","doi-asserted-by":"crossref","unstructured":"Astudillo RF, Correia J, Trancoso I (2015) Integration of DNN based speech enhancement and ASR. In: Proceedings of Interspeech, pp 3576\u20133580","DOI":"10.21437\/Interspeech.2015-709"},{"key":"8782_CR7","doi-asserted-by":"crossref","unstructured":"Bacchiani M, Senior A, Heigold G (2014) Asynchronous, online, gmm-free training of a context dependent acoustic model for speech recognition. In: Proceedings of Interspeech, pp 1900\u20131904","DOI":"10.21437\/Interspeech.2014-430"},{"key":"8782_CR8","doi-asserted-by":"publisher","unstructured":"Becerra A, de la Rosa J, Gonz\u00e1lez E (2016) A case study of speech recognition in Spanish: from conventional to deep approach. In: Proceedings of IEEE ANDESCON. https:\/\/doi.org\/10.1109\/ANDESCON.2016.7836212","DOI":"10.1109\/ANDESCON.2016.7836212"},{"key":"8782_CR9","doi-asserted-by":"publisher","unstructured":"Becerra A, de la Rosa J, Gonz\u00e1lez E (2017) Speech recognition using deep neural networks trained with non-uniform frame-level cost functions. In: Proceedings of IEEE international autumn meeting on power, electronics and computing (ROPEC). https:\/\/doi.org\/10.1109\/ROPEC.2017.8261588","DOI":"10.1109\/ROPEC.2017.8261588"},{"issue":"77","key":"8782_CR10","doi-asserted-by":"publisher","first-page":"15,875","DOI":"10.1007\/s11042-017-5160-5","volume":"12","author":"A Becerra","year":"2018","unstructured":"Becerra A, de la Rosa J, Gonz\u00e1lez E (2018) Speech recognition in a dialog system: from conventional to deep processing. a case study applied to Spanish. Multimed Tools Appl 12(77):15,875\u201315,911. 
https:\/\/doi.org\/10.1007\/s11042-017-5160-5","journal-title":"Multimed Tools Appl"},{"issue":"77","key":"8782_CR11","doi-asserted-by":"publisher","first-page":"27,231","DOI":"10.1007\/s11042-018-5917-5","volume":"20","author":"A Becerra","year":"2018","unstructured":"Becerra A, de la Rosa J, Gonz\u00e1lez E, Pedroza A, Escalante N (2018) Training deep neural networks with non-uniform frame-level cost function for automatic speech recognition. Multimed Tools Appl 20(77):27,231\u201327,267. https:\/\/doi.org\/10.1007\/s11042-018-5917-5","journal-title":"Multimed Tools Appl"},{"issue":"1","key":"8782_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1561\/2200000006","volume":"2","author":"Y Bengio","year":"2009","unstructured":"Bengio Y (2009) Learning deep architectures for ai. Found Trends Mach Learn 2(1):1\u2013127. https:\/\/doi.org\/10.1561\/2200000006","journal-title":"Found Trends Mach Learn"},{"issue":"3","key":"8782_CR13","doi-asserted-by":"crossref","first-page":"869","DOI":"10.1093\/ietisy\/e89-d.3.869","volume":"E89-D","author":"J Bilmes","year":"2006","unstructured":"Bilmes J (2006) What hmms can do. IEICE Trans Inf Syst E E89-D(3):869\u2013891","journal-title":"IEICE Trans Inf Syst E"},{"key":"8782_CR14","volume-title":"Pattern recognition and machine learning","author":"C Bishop","year":"2006","unstructured":"Bishop C (2006) Pattern recognition and machine learning. Springer, New York"},{"key":"8782_CR15","volume-title":"Connectionist speech recognition: a hybrid approach","author":"H Bourlard","year":"1993","unstructured":"Bourlard H, Morgan N (1993) Connectionist speech recognition: a hybrid approach. Kluwer Academic Publishers, Norwell"},{"key":"8782_CR16","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4615-3210-1","volume-title":"Connectionist speech recognition: a hybrid approach","author":"H Bourlard","year":"1994","unstructured":"Bourlard H, Morgan N (1994) Connectionist speech recognition: a hybrid approach. 
Kluwer Academic Publishers, Boston. https:\/\/doi.org\/10.1007\/978-1-4615-3210-1"},{"issue":"3","key":"8782_CR17","doi-asserted-by":"crossref","first-page":"489","DOI":"10.1109\/TIT.1982.1056497","volume":"28","author":"J Burbea","year":"1982","unstructured":"Burbea J, Rao R (1982) On the convexity of some divergence measures based on entropy functions. IEEE Trans Inf Theory 28(3):489\u2013495","journal-title":"IEEE Trans Inf Theory"},{"key":"8782_CR18","doi-asserted-by":"publisher","first-page":"278","DOI":"10.1016\/j.neucom.2017.08.040","volume":"275","author":"W Cao","year":"2018","unstructured":"Cao W, Wang X, Ming Z, Gao J (2018) A review on neural networks with random weights. Neurocomputing 275:278\u2013287. https:\/\/doi.org\/10.1016\/j.neucom.2017.08.040","journal-title":"Neurocomputing"},{"key":"8782_CR19","doi-asserted-by":"crossref","unstructured":"Chen X, Eversole A, Li G, Yu D, Seide F (2012) Pipelined back-propagation for context-dependent deep neural networks. In: Proceedings of Interspeech","DOI":"10.21437\/Interspeech.2012-7"},{"issue":"1","key":"8782_CR20","doi-asserted-by":"crossref","first-page":"30","DOI":"10.1109\/TASL.2011.2134090","volume":"20","author":"GE Dahl","year":"2012","unstructured":"Dahl GE, Yu D, Deng L, Acero A (2012) Context-dependent pre-trained deep neural networks for large vocabulary speech recognition. IEEE Trans Audio Speech Lang Process 20(1):30\u201342","journal-title":"IEEE Trans Audio Speech Lang Process"},{"issue":"7","key":"8782_CR21","doi-asserted-by":"crossref","first-page":"1677","DOI":"10.1109\/78.134406","volume":"39","author":"L Deng","year":"1991","unstructured":"Deng L, Kenny P, Lennig M, Gupta V, Seitz F, Mermelstein P (1991) Phonemic hidden Markov models with continuous mixture output densities for large vocabulary word recognition. 
IEEE Trans Signal Process 39(7):1677\u20131681","journal-title":"IEEE Trans Signal Process"},{"issue":"5","key":"8782_CR22","doi-asserted-by":"crossref","first-page":"1060","DOI":"10.1109\/TASL.2013.2244083","volume":"21","author":"L Deng","year":"2013","unstructured":"Deng L, Li X (2013) Machine learning paradigms for speech recognition: an overview. IEEE Trans Audio Speech, Lang Process 21(5):1060\u20131089","journal-title":"IEEE Trans Audio Speech, Lang Process"},{"key":"8782_CR23","volume-title":"Pattern classification","author":"R Duda","year":"2001","unstructured":"Duda R, Hart P, Stork D (2001) Pattern classification. Wiley, New York"},{"issue":"2","key":"8782_CR24","doi-asserted-by":"crossref","first-page":"291","DOI":"10.1109\/89.279278","volume":"2","author":"J Gauvain","year":"1994","unstructured":"Gauvain J, Ch L (1994) Maximum a posteriori estimation for multivariate gaussian mixture observations of Markov chains. IEEE Trans Speech Audio Process 2 (2):291\u2013298","journal-title":"IEEE Trans Speech Audio Process"},{"key":"8782_CR25","doi-asserted-by":"crossref","unstructured":"Ge Z, Iyer AN, Cheluvaraja S, Sundaram R, Ganapathiraju A (2017) Neural network based speaker classification and verification systems with enhanced features. In: Proceedings of intelligent systems conference","DOI":"10.1109\/IntelliSys.2017.8324265"},{"key":"8782_CR26","unstructured":"Golik P, Doetsch P (2013) Cross-entropy vs. squared error training: a theoretical and experimental comparison. In: Proceedings of Interspeech, pp 1756\u20131760"},{"key":"8782_CR27","unstructured":"Hagan MT, Demuth HB, Beale MH, de Jes\u00fas O (2014) Neural network design. CreateSpace, US"},{"key":"8782_CR28","volume-title":"Neural networks and learning machines","author":"S Haykin","year":"2009","unstructured":"Haykin S (2009) Neural networks and learning machines. 
Pearson Education, New Jersey"},{"issue":"12","key":"8782_CR29","doi-asserted-by":"crossref","first-page":"2616","DOI":"10.1109\/TASL.2013.2280234","volume":"21","author":"G Heigold","year":"2013","unstructured":"Heigold G, Ney H, Schl\u00fcter R (2013) Investigations on an em-style optimization algorithm for discriminative training of hmms. IEEE Trans Audio Speech Lang Process 21(12):2616\u20132626","journal-title":"IEEE Trans Audio Speech Lang Process"},{"issue":"6","key":"8782_CR30","doi-asserted-by":"crossref","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton G, Deng L, Yu D, Dahl GE, Mohamed A, Jaitly N, Senior A, Vanhoucke V, Nguyen P, Sainath TN, Kingsbury B (2012) Deep neural networks for acoustic modeling in speech recognition. IEEE Signal Proc Mag 29(6):82\u201397","journal-title":"IEEE Signal Proc Mag"},{"issue":"7","key":"8782_CR31","doi-asserted-by":"crossref","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"G Hinton","year":"2006","unstructured":"Hinton G, Osindero S, Teh Y (2006) A fast learning algorithm for deep belief nets. Neural Comput 18(7):1527\u20131554","journal-title":"Neural Comput"},{"key":"8782_CR32","doi-asserted-by":"crossref","unstructured":"Huang JT, Li J, Gong Y (2015) An analysis of convolutional neural networks for speech recognition. In: Proceedings of IEEE international conference on acoustics, speech and signal processing, pp 4989\u20134993","DOI":"10.1109\/ICASSP.2015.7178920"},{"key":"8782_CR33","doi-asserted-by":"crossref","unstructured":"Huang Y, Yu D, Liu C, Gong Y (2014) A comparative analytic study on the Gaussian mixture and context dependent deep neural network hidden Markov models. 
In: Proceedings of Interspeech, pp 1895\u20131899","DOI":"10.21437\/Interspeech.2014-429"},{"key":"8782_CR34","doi-asserted-by":"crossref","unstructured":"Huang Z, Li J, Ch W, Ch L (2014) Beyond cross-entropy: towards better frame-level objective functions for deep neural network training in automatic speech recognition. In: Proceedings of Interspeech, pp 1214\u20131218","DOI":"10.21437\/Interspeech.2014-306"},{"issue":"4","key":"8782_CR35","doi-asserted-by":"crossref","first-page":"414","DOI":"10.1109\/89.242487","volume":"1","author":"M Hwang","year":"1993","unstructured":"Hwang M, Huang X (1993) Shared-distribution hidden Markov models for speech recognition. IEEE Trans Audio Speech Lang Process 1(4):414\u2013420","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"8782_CR36","unstructured":"Jaitly N (2014) Exploring deep learning methods for discovering features in speech signals. Ph.D. thesis, University of Toronto"},{"issue":"2","key":"8782_CR37","doi-asserted-by":"crossref","first-page":"307","DOI":"10.1109\/TIT.1986.1057145","volume":"IT-32","author":"BH Juang","year":"1986","unstructured":"Juang BH, Levinson SE, Sondhi M (1986) Maximum likelihood estimation for multivariate mixture observations of Markov chains. IEEE Trans Inform Theory IT-32(2):307\u2013309","journal-title":"IEEE Trans Inform Theory"},{"key":"8782_CR38","unstructured":"Jurafsky D, Martin J (2008) Speech and language processing: an introduction to natural language processing, computational linguistics and speech recognition. Pearson, NJ"},{"key":"8782_CR39","doi-asserted-by":"crossref","unstructured":"Kingsbury B, Sainath TN, Soltau H (2012) Scalable minimum Bayes risk training of deep neural network acoustic models using distributed hessian-free optimization. 
In: Proceedings of InterSpeech","DOI":"10.21437\/Interspeech.2012-3"},{"issue":"1","key":"8782_CR40","first-page":"40","volume":"30","author":"F Lad","year":"2015","unstructured":"Lad F, Sanfilippo G, Agr\u00f3 G (2015) Extropy: complementary dual of entropy. Stat Sci 30(1):40\u201358","journal-title":"Stat Sci"},{"key":"8782_CR41","doi-asserted-by":"crossref","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun Y, Bengio Y, Hinton G (2015) Deep learning. Nature 521:436\u2013444","journal-title":"Nature"},{"key":"8782_CR42","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1016\/j.neucom.2017.06.058","volume":"272","author":"G Li","year":"2017","unstructured":"Li G, Deng L, Tian L, Cui H, Han W, Pei J, Shi L (2017) Training deep neural networks with discrete state transition. Neurocomputing 272:154\u2013162. https:\/\/doi.org\/10.1016\/j.neucom.2017.06.058","journal-title":"Neurocomputing"},{"key":"8782_CR43","doi-asserted-by":"crossref","unstructured":"Li X, Wu X (2014) Labeling unsegmented sequence data with dnn-hmm and its application for speech recognition. In: Proceedings of int. symp. on Chinese spoken language processing (ISCSLP)","DOI":"10.1109\/ISCSLP.2014.6936622"},{"issue":"C","key":"8782_CR44","doi-asserted-by":"crossref","first-page":"251","DOI":"10.1016\/j.neucom.2014.07.087","volume":"170","author":"X Li","year":"2015","unstructured":"Li X, Yang Y, Pang Z, Wu X (2015) A comparative study on selecting acoustic modeling units in deep neural networks based large vocabulary chinese speech recognition. Neurocomputing 170(C):251\u2013256","journal-title":"Neurocomputing"},{"key":"8782_CR45","doi-asserted-by":"publisher","unstructured":"Liao Y, Lee H, Lee L (2015) Towards structured deep neural network for automatic speech recognition. In: Proceedings of IEEE conference of automatic speech recognition and understanding workshop (ASRU). 
https:\/\/doi.org\/10.1109\/ASRU.2015.7404786","DOI":"10.1109\/ASRU.2015.7404786"},{"key":"8782_CR46","doi-asserted-by":"publisher","first-page":"11","DOI":"10.1016\/j.neucom.2016.12.038","volume":"234","author":"W Liu","year":"2017","unstructured":"Liu W, Wang Z, Liu X, Zeng N, Liu Y, Alsaadi F (2017) A survey of deep neural network architectures and their applications. Neurocomputing 234:11\u201326. https:\/\/doi.org\/10.1016\/j.neucom.2016.12.038","journal-title":"Neurocomputing"},{"issue":"3","key":"8782_CR47","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1561\/2000000004","volume":"1","author":"SJ Young","year":"2007","unstructured":"M G, Young SJ (2007) The application of hidden Markov models in speech recognition. Found Trends Signal Process 1(3):195\u2013304","journal-title":"Found Trends Signal Process"},{"key":"8782_CR48","volume-title":"Mixture models","author":"G McLachlan","year":"1988","unstructured":"McLachlan G (1988) Mixture models. Marcel Dekker, New York"},{"key":"8782_CR49","doi-asserted-by":"crossref","unstructured":"Miao Y, Metze F (2013) Improving low-resource cd-dnn-hmm using dropout and multilingual dnn training. In: Proceedings of InterSpeech, pp 2237\u20132241","DOI":"10.21437\/Interspeech.2013-526"},{"issue":"1","key":"8782_CR50","doi-asserted-by":"crossref","first-page":"14","DOI":"10.1109\/TASL.2011.2109382","volume":"20","author":"A Mohamed","year":"2012","unstructured":"Mohamed A, Dahl GE, Hinton G (2012) Acoustic modeling using deep belief networks. IEEE Trans Audio Speech Lang Process 20(1):14\u201322","journal-title":"IEEE Trans Audio Speech Lang Process"},{"issue":"3","key":"8782_CR51","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1109\/79.382443","volume":"12","author":"N Morgan","year":"1995","unstructured":"Morgan N, Bourlard H (1995) An introduction to hybrid hmm\/connectionist continuous speech recognition. 
IEEE Signal Process Mag 12(3):25\u201342","journal-title":"IEEE Signal Process Mag"},{"issue":"4","key":"8782_CR52","doi-asserted-by":"crossref","first-page":"458","DOI":"10.1587\/transele.E94.C.458","volume":"E94C","author":"H Noguchi","year":"2011","unstructured":"Noguchi H, Miura K, Fujinaga T, Sugahara T, Kawaguchi H, Yoshimoto M (2011) Vlsi architecture of gmm processing and viterbi decoder for 60,000-word real-time continuous speech recognition. IEICE Trans Electron E94C (4):458\u2013467","journal-title":"IEICE Trans Electron"},{"key":"8782_CR53","doi-asserted-by":"crossref","unstructured":"Pan J, Liu C, Wang Z, Hu Y, Jiang H (2012) Investigation of deep neural networks (dnn) for large vocabulary continuous speech recognition: why dnn surpass gmms in acoustic modeling. In: Proceedings of international symposium on chinese spoken language processing, pp 301\u2013305","DOI":"10.1109\/ISCSLP.2012.6423452"},{"key":"8782_CR54","unstructured":"Povey D, Ghoshal A, Boulianne G, Burget L, Glembek O, Goel N, Hannemann M, Motlicek P, Qian Y, Schwarz P, Silovsky J, Stemmer G, Vesely K (2011) The kaldi speech recognition toolkit. In: Proceedings of IEEE automatic speech recognition and understanding workshop (ASRU)"},{"key":"8782_CR55","doi-asserted-by":"publisher","first-page":"242","DOI":"10.1016\/j.neucom.2016.06.014","volume":"214","author":"A Prieto","year":"2016","unstructured":"Prieto A, Prieto B, Ortigosa EM, Ros E, Pelayo F, Ortega J, Rojas I (2016) Neural networks: an overview of early research, current frameworks and new challenges. Neurocomputing 214:242\u2013268. https:\/\/doi.org\/10.1016\/j.neucom.2016.06.014","journal-title":"Neurocomputing"},{"key":"8782_CR56","doi-asserted-by":"crossref","unstructured":"Rabiner L (1989) A tutorial on hidden Markov models and selected applications in speech recognition. 
In: Proceedings of IEEE, vol 77, pp 257\u2013286","DOI":"10.1109\/5.18626"},{"key":"8782_CR57","volume-title":"Fundamentals of speech recognition","author":"L Rabiner","year":"1993","unstructured":"Rabiner L, Juang B (1993) Fundamentals of speech recognition. Prentice-Hall, New Jersey"},{"issue":"1-2","key":"8782_CR58","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1561\/2000000001","volume":"1","author":"L Rabiner","year":"2007","unstructured":"Rabiner L, Schafer R (2007) Introduction to digital speech processing. Found Trends Signal Process 1(1-2):1\u2013194","journal-title":"Found Trends Signal Process"},{"key":"8782_CR59","doi-asserted-by":"crossref","unstructured":"Rao R (1984) Use of diversity and distance measures in the analysis of qualitative data. In: Van Vark GN, Howells WW (eds) Multivariate statistical methods in physical anthropology. D. Reidel Publishing Company, Dordrecht, pp 49\u201367","DOI":"10.1007\/978-94-009-6357-3_5"},{"key":"8782_CR60","doi-asserted-by":"crossref","unstructured":"Rath S, Povey D, Vesel K, Cernock J (2013) Improved feature processing for deep neural networks. In: Proceedings of Interspeech, pp 109\u2013113","DOI":"10.21437\/Interspeech.2013-48"},{"key":"8782_CR61","doi-asserted-by":"crossref","unstructured":"Ray J, Thompson B, Shen W (2014) Comparing a high and low-level deep neural network implementation for automatic speech recognition. In: Proceedings of workshop for high performance technical computing in dynamic languages (HPTCDL), pp 41\u201346","DOI":"10.1109\/HPTCDL.2014.12"},{"issue":"1","key":"8782_CR62","doi-asserted-by":"crossref","first-page":"19","DOI":"10.1006\/dspr.1999.0361","volume":"10","author":"DA Reynolds","year":"2000","unstructured":"Reynolds DA, Quatieri TF, Trb D (2000) Speaker verification using adapted gaussian mixture models. 
Digital Signal Process 10(1):19\u201341","journal-title":"Digital Signal Process"},{"issue":"4","key":"8782_CR63","doi-asserted-by":"publisher","first-page":"461","DOI":"10.1162\/neco.1991.3.4.461","volume":"3","author":"MD Richard","year":"1991","unstructured":"Richard MD, Lippmann RP (1991) Neural network classifiers estimate bayesian a posteriori probabilities. Neural Comput 3(4):461\u2013483. https:\/\/doi.org\/10.1162\/neco.1991.3.4.461","journal-title":"Neural Comput"},{"issue":"3","key":"8782_CR64","first-page":"1","volume":"5","author":"T Robinson","year":"1994","unstructured":"Robinson T (1994) An application of recurrent nets to phone probability estimation. IEEE Trans Neural Netw 5(3):1\u201316","journal-title":"IEEE Trans Neural Netw"},{"key":"8782_CR65","doi-asserted-by":"publisher","unstructured":"Sainath T, Kingsbury B, Mohamed AR, E Dahl G, Saon G, Soltau H, Beran T, Aravkin A, Ramabhadran B (2013) Improvements to deep convolutional neural networks for lvcsr. In: Proceedings of IEEE conference of automatic speech recognition and understanding workshop (ASRU). https:\/\/doi.org\/10.1109\/ASRU.2013.6707749","DOI":"10.1109\/ASRU.2013.6707749"},{"key":"8782_CR66","unstructured":"Sainath TN, Kingsbury B, Ramabhadran B (2012) Improving training time of deep belief networks through hybrid pre-training and larger batch sizes. In: Proceedings of neural information processing systems, workshop on log-linear models"},{"key":"8782_CR67","doi-asserted-by":"publisher","unstructured":"Sainath TN, Kingsbury B, Ramabhadran B, Fousek P, Novak P, Mohamed A (2011) Making deep belief networks effective for large vocabulary continuous speech recognition. In: Proceedings of IEEE conference of automatic speech recognition and understanding workshop (ASRU), pp 30\u201335. 
https:\/\/doi.org\/10.1109\/ASRU.2011.6163900","DOI":"10.1109\/ASRU.2011.6163900"},{"issue":"11","key":"8782_CR68","doi-asserted-by":"crossref","first-page":"2267","DOI":"10.1109\/TASL.2013.2284378","volume":"21","author":"TN Sainath","year":"2013","unstructured":"Sainath TN, Kingsbury B, Soltau H, Ramabhadran B (2013) Optimization techniques to improve training speed of deep neural networks for large speech tasks. IEEE Trans Audio Speech Lang Process 21(11):2267\u20132276","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"8782_CR69","doi-asserted-by":"crossref","unstructured":"Sainath TN, Mohamed A, Kingsbury B, Ramabhadran B (2013) Deep convolutional neural networks for lvcsr. In: Proceedings of IEEE international conference on acoustics, speech and signal processing, pp 8614\u20138618","DOI":"10.1109\/ICASSP.2013.6639347"},{"key":"8782_CR70","unstructured":"Scowen R (1993) Extended bnf - generic base standards. In: Proceedings of software engineering standards symp, pp 25\u201334"},{"key":"8782_CR71","doi-asserted-by":"crossref","unstructured":"Seide F, Li G, Chen X, Yu D (2011) Feature engineering in context-dependent deep neural networks for conversational speech transcription. In: Proceedings of IEEE conference of automatic speech recognition and understanding workshop (ASRU), pp 24\u201329","DOI":"10.1109\/ASRU.2011.6163899"},{"key":"8782_CR72","doi-asserted-by":"crossref","unstructured":"Seide F, Li G, Yu D (2011) Conversational speech transcription using context-dependent deep neural networks. In: Proceedings of Interspeech, pp 437\u2013440","DOI":"10.21437\/Interspeech.2011-169"},{"key":"8782_CR73","doi-asserted-by":"crossref","unstructured":"Seki H, Yamamoto K, Nakagawa S (2014) Comparison of syllable-based and phoneme-based dnn-hmm in japanese speech recognition. In: Proceedings of int conf. 
of advanced informatics concept, theory and application (ICAICTA), pp 249\u2013254","DOI":"10.1109\/ICAICTA.2014.7005949"},{"key":"8782_CR74","doi-asserted-by":"crossref","unstructured":"Seltzer ML, Yu D, Wang Y (2013) An investigation of deep neural networks for noise robust speech recognition. In: Proceedings of IEEE international conference on acoustics, speech and signal processing, pp 7398\u20137402","DOI":"10.1109\/ICASSP.2013.6639100"},{"key":"8782_CR75","unstructured":"Senior A, Heigold G, Bacchiani M, Liao H (2014) Gmm-free dnn training. In: Proceedings of IEEE international conference on acoustics, speech and signal processing, pp 5639\u20135643"},{"key":"8782_CR76","doi-asserted-by":"crossref","first-page":"326","DOI":"10.1016\/j.neucom.2014.03.005","volume":"140","author":"SM Siniscalchi","year":"2014","unstructured":"Siniscalchi SM, Svendsen T, Lee CH (2014) An artificial neural network approach to automatic speech processing. Neurocomputing 140:326\u2013338","journal-title":"Neurocomputing"},{"key":"8782_CR77","doi-asserted-by":"crossref","unstructured":"Su H, Li G, Yu D, Seide F (2013) Error back propagation for sequence training of context-dependent deep networks for conversational speech transcription. In: Proceedings of IEEE international conference on acoustics, speech and signal processing, pp 6664\u20136668","DOI":"10.1109\/ICASSP.2013.6638951"},{"issue":"1-4","key":"8782_CR78","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1016\/S0925-2312(00)00308-8","volume":"37","author":"E Trentin","year":"2001","unstructured":"Trentin E, Gori M (2001) A survey of hybrid ann\/hmm models for automatic speech recognition. Neurocomputing 37(1-4):91\u2013126","journal-title":"Neurocomputing"},{"key":"8782_CR79","doi-asserted-by":"crossref","unstructured":"Vesely K, Ghoshal A, Burget L, Povey D (2013) Sequence-discriminative training of deep neural networks. 
In: Proceedings of Interspeech, pp 2345\u20132349","DOI":"10.21437\/Interspeech.2013-548"},{"key":"8782_CR80","doi-asserted-by":"crossref","unstructured":"Vesely K, Hannemann M, Burget L (2013) Semi-supervised training of deep neural networks. In: Proceedings of IEEE conference of automatic speech recognition and understanding workshop (ASRU), pp 267\u2013272","DOI":"10.1109\/ASRU.2013.6707741"},{"key":"8782_CR81","doi-asserted-by":"publisher","unstructured":"Vesel\u00fd K, Vesel K (2010) Parallel training of neural networks for speech recognition. In: Proceedings of Interspeech, pp 2934\u20132937. https:\/\/doi.org\/10.1007\/b100511","DOI":"10.1007\/b100511"},{"key":"8782_CR82","doi-asserted-by":"crossref","unstructured":"Vincent P, Larochelle H, Bengio Y, Manzagol P (2008) Extracting and composing robust features with denoising autoencoders. In: Proceedings of int conf on machine learning (ICML), pp 1096\u20131103","DOI":"10.1145\/1390156.1390294"},{"key":"8782_CR83","doi-asserted-by":"crossref","unstructured":"Wang G, Sim KC (2011) Sequential classification criteria for NNs in automatic speech recognition. In: Proceedings of Interspeech, pp 441\u2013444","DOI":"10.21437\/Interspeech.2011-170"},{"key":"8782_CR84","unstructured":"Wang X, Wang L, Chen J, Wu L (2016) Toward a better understanding of deep neural network based acoustic modelling: an empirical investigation. In: Proceedings of 30th conference on artificial intelligence (AAAI 2016), pp 2173\u20132179"},{"key":"8782_CR85","doi-asserted-by":"publisher","unstructured":"Wei W, van Vuuren S (1998) Improved neural network training of inter-word context. In: Proceedings of IEEE international conference on acoustics, speech and signal processing, pp 1520\u20136149. 
https:\/\/doi.org\/10.1109\/ICASSP.1998.674476","DOI":"10.1109\/ICASSP.1998.674476"},{"key":"8782_CR86","doi-asserted-by":"crossref","unstructured":"Wiesler S, Golik P, Schluter R, Ney H (2015) Investigations on sequence training of neural networks. In: Proceedings of IEEE international conference on acoustics, speech and signal processing, pp 4565\u20134569","DOI":"10.1109\/ICASSP.2015.7178835"},{"issue":"1","key":"8782_CR87","first-page":"1070","volume":"21","author":"Y Xu","year":"2014","unstructured":"Xu Y, Du J, Dai LR, Lee C (2014) An experimental study on speech enhancement based on deep neural networks. IEEE Signal Process Lett 21(1):1070\u20139908","journal-title":"IEEE Signal Process Lett"},{"issue":"12","key":"8782_CR88","doi-asserted-by":"crossref","first-page":"1713","DOI":"10.1109\/TASLP.2014.2346313","volume":"22","author":"S Xue","year":"2014","unstructured":"Xue S, Abdel-Hamid O, Jiang H, Dai L, Liu Q (2014) Fast adaptation of deep neural network based on discriminant codes for speech recognition. IEEE Trans Audio Speech Lang Process 22(12):1713\u20131725","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"8782_CR89","doi-asserted-by":"crossref","first-page":"68","DOI":"10.1007\/978-3-319-10581-9_9","volume":"8679","author":"Z Yang","year":"2014","unstructured":"Yang Z, Zhong A, Carass A, Ying SH, Prince JL (2014) Deep learning for cerebellar ataxia classification and functional score regression. Lect Notes Comput Sci 8679:68\u201376","journal-title":"Lect Notes Comput Sci"},{"key":"8782_CR90","doi-asserted-by":"crossref","unstructured":"Yao K, Yu D, Seide F, Su H, Deng L, Gong Y (2012) Adaptation of context-dependent deep neural networks for automatic speech recognition. 
In: Proceedings of IEEE spoken language technology workshop (SLT), pp 366\u2013369","DOI":"10.1109\/SLT.2012.6424251"},{"issue":"5","key":"8782_CR91","doi-asserted-by":"crossref","first-page":"45","DOI":"10.1109\/79.536824","volume":"13","author":"S Young","year":"1996","unstructured":"Young S (1996) Large vocabulary continuous speech recognition: a review. IEEE Signal Process Mag 13(5):45\u201357","journal-title":"IEEE Signal Process Mag"},{"key":"8782_CR92","doi-asserted-by":"crossref","unstructured":"Young S (2008) Hmms and related speech recognition technologies. In: Benesty J (ed) Springer handbook of speech processing. Springer, Berlin, pp 539\u2013558","DOI":"10.1007\/978-3-540-49127-9_27"},{"key":"8782_CR93","unstructured":"Young S, Evermann G, Gales M, Hain T, Kershaw D, Liu X, Moore G, Odell J, Ollason D, Povey D, Valtchev V, Woodland P (2006) The HTK book (for version 3.4). Cambridge University Engineering Department, UK"},{"key":"8782_CR94","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4471-5779-3","volume-title":"Automatic speech recognition: a deep learning approach","author":"D Yu","year":"2015","unstructured":"Yu D, Deng L (2015) Automatic speech recognition: a deep learning approach. Springer, London"},{"key":"8782_CR95","unstructured":"Yu D, Deng L, Dahl GE (2010) Roles of pretraining and fine-tuning in context-dependent dnn-hmms for real-world speech recognition. In: Proceedings of NIPS workshop on deep learning and unsupervised feature learning"},{"key":"8782_CR96","doi-asserted-by":"crossref","unstructured":"Yu D, Seide G, Li G, Deng L (2012) Exploiting sparseness in deep neural networks for large vocabulary speech recognition. 
In: Proceedings of IEEE international conference on acoustics, speech and signal processing, pp 4409\u20134412","DOI":"10.1109\/ICASSP.2012.6288897"},{"key":"8782_CR97","doi-asserted-by":"publisher","first-page":"634","DOI":"10.1016\/j.neucom.2017.08.044","volume":"273","author":"Z Zeng","year":"2018","unstructured":"Zeng Z, Liang N, Yang X, Hoi S (2018) Multi-target deep neural networks: Theoretical analysis and implementation. Neurocomputing 273:634\u2013642. https:\/\/doi.org\/10.1016\/j.neucom.2017.08.044","journal-title":"Neurocomputing"},{"key":"8782_CR98","doi-asserted-by":"crossref","unstructured":"Zhang C, Woodland PC (2014) Standalone training of context-dependent deep neural network acoustic models. In: Proceedings of IEEE international conference on acoustics, speech and signal processing, pp 5597\u20135601","DOI":"10.1109\/ICASSP.2014.6854674"},{"issue":"4","key":"8782_CR99","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1109\/5326.897072","volume":"30","author":"G Zhang","year":"2000","unstructured":"Zhang G (2000) Neural networks for classification: a survey. IEEE Transactions on Systems, Man and Cybernetics, Part C (Applications and Reviews) 30(4):451\u2013462. https:\/\/doi.org\/10.1109\/5326.897072","journal-title":"IEEE Transactions on Systems, Man and Cybernetics, Part C (Applications and Reviews)"},{"key":"8782_CR100","doi-asserted-by":"crossref","unstructured":"Zhang T (2004) Solving large scale linear prediction problems using stochastic gradient descent algorithms. In: Proceedings of int. conf. on machine learning (ICML), pp 919\u2013926","DOI":"10.1145\/1015330.1015332"},{"key":"8782_CR101","doi-asserted-by":"crossref","unstructured":"Zhao R, Li J, Gong Y (2014) Variable-component deep neural network for robust speech recognition. 
In: Proceedings of Interspeech","DOI":"10.1109\/SLT.2014.7078632"},{"issue":"4","key":"8782_CR102","doi-asserted-by":"crossref","first-page":"631","DOI":"10.1109\/TASLP.2015.2392944","volume":"23","author":"P Zhou","year":"2015","unstructured":"Zhou P, Jiang H, Dai L, Hu Y, Liu Q (2015) State-clustering based multiple deep neural networks modeling approach for speech recognition. IEEE Trans Audio Speech Lang Process 23(4):631\u2013642","journal-title":"IEEE Trans Audio Speech Lang Process"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-08782-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-020-08782-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-08782-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,29]],"date-time":"2023-09-29T05:52:42Z","timestamp":1695966762000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-020-08782-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,3,27]]},"references-count":102,"journal-issue":{"issue":"27-28","published-print":{"date-parts":[[2020,7]]}},"alternative-id":["8782"],"URL":"https:\/\/doi.org\/10.1007\/s11042-020-08782-0","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2020,3,27]]},"assertion":[{"value":"25 January 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 February 
2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 March 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with Ethical Standards"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"<!--Emphasis Type='Bold' removed-->Conflict of interests"}}]}}