{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T04:57:34Z","timestamp":1764997054011},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2015,9,4]],"date-time":"2015-09-04T00:00:00Z","timestamp":1441324800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J AUDIO SPEECH MUSIC PROC."],"published-print":{"date-parts":[[2015,12]]},"DOI":"10.1186\/s13636-015-0068-3","type":"journal-article","created":{"date-parts":[[2015,9,4]],"date-time":"2015-09-04T10:14:47Z","timestamp":1441361687000},"update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":48,"title":["Phone recognition with hierarchical convolutional deep maxout networks"],"prefix":"10.1186","volume":"2015","author":[{"given":"L\u00e1szl\u00f3","family":"T\u00f3th","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2015,9,4]]},"reference":[{"key":"68_CR1","doi-asserted-by":"publisher","first-page":"887","DOI":"10.1121\/1.1945807","volume":"118","author":"T Chih","year":"2005","unstructured":"T Chih, P Ru, S Shamma, Multiresolution spectrotemporal analysis of complex sounds. J Acoust Soc Am. 118, 887\u2013906 (2005).","journal-title":"J Acoust Soc Am"},{"key":"68_CR2","unstructured":"Y Lecun, Y Bengio, in The Handbook of Brain Theory and Neural Networks, ed. by MA Arbib. Convolutional networks for images, speech and time series (MIT PressCambridge, 1995), pp. 255\u2013258."},{"issue":"1","key":"68_CR3","first-page":"14","volume":"20","author":"A Mohamed","year":"2012","unstructured":"A Mohamed, GE Dahl, G Hinton, Acoustic modeling using deep belief networks. IEEE Trans ASLP. 20(1), 14\u201322 (2012).","journal-title":"IEEE Trans ASLP"},{"issue":"1","key":"68_CR4","first-page":"30","volume":"20","author":"GE Dahl","year":"2012","unstructured":"GE Dahl, D Yu, L Deng, A Acero, Context-dependent pre-trained deep neural networks for large vocabulary speech recognition. IEEE Trans ASLP. 20(1), 30\u201342 (2012).","journal-title":"IEEE Trans ASLP"},{"key":"68_CR5","unstructured":"F Seide, G Li, L Chen, D Yu, in Proc ASRU. Feature engineering in context-dependent deep neural networks for conversational speech transcription, (2011), pp. 24\u201329."},{"key":"68_CR6","doi-asserted-by":"crossref","unstructured":"N Jaitly, P Nguyen, A Senior, V Vanhoucke, in Proc Interspeech. Application of pretrained deep neural networks to large vocabulary speech recognition, (2012).","DOI":"10.21437\/Interspeech.2012-10"},{"key":"68_CR7","unstructured":"O Abdel-Hamid, A Mohamed, H Jiang, G Penn, in Proc ICASSP. Applying convolutional neural network concepts to hybrid NN-HMM model for speech recognition, (2012), pp. 4277\u20134280."},{"key":"68_CR8","unstructured":"L Deng, O Abdel-Hamid, D Yu, in Proc ICASSP. A deep convolutional neural network using heterogeneous pooling for trading acoustic invariance with phonetic confusion, (2013), pp. 6669\u20136673."},{"key":"68_CR9","unstructured":"TN Sainath, A Mohamed, B Kingsbury, B Ramabhadran, in Proc ICASSP. Deep convolutional neural networks for LVCSR, (2013), pp. 8614\u20138618."},{"key":"68_CR10","unstructured":"L T\u00f3th, in Proc ICASSP. Combining time- and frequency-domain convolution in convolutional neural network-based phone recognition, (2014), pp. 190\u2013194."},{"key":"68_CR11","unstructured":"O Abdel-Hamid, L Deng, D Yu, in Proc Interspeech. Exploring convolutional neural network structures and optimization techniques for speech recognition, (2013), pp. 3366\u20133370."},{"key":"68_CR12","unstructured":"TN Sainath, B Kingsbury, A Mohamed, G Dahl, G Saon, H Soltau, T Beran, A Aravkin, B Ramabhadran, in Proc ASRU. Improvements to deep convolutional neural networks for LVCSR, (2013), pp. 315\u2013320."},{"key":"68_CR13","unstructured":"TN Sainath, A Mohamed, B Kingsbury, B Ramabhadran, in Proc ICASSP. Joint training of convolutional and non-convolutional neural networks, (2014), pp. 5572\u20135576."},{"key":"68_CR14","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1016\/j.neunet.2014.08.005","volume":"64","author":"TN Sainath","year":"2015","unstructured":"TN Sainath, B Kingsbury, G Saon, H Soltau, A Mohamed, G Dahl, B Ramabhadran, Deep convolutional neural networks for large-scale speech tasks. Neural Netw. 64, 39\u201348 (2015). doi: 10.1016\/j.neunet.2014.08.005 .","journal-title":"Neural Netw"},{"key":"68_CR15","unstructured":"IJ Goodfellow, D Warde-Farley, M Mirza, A Courville, Y Bengio, in Proc ICML. Maxout networks, (2013), pp. 1319\u20131327."},{"key":"68_CR16","unstructured":"X Glorot, A Bordes, Y Bengio, in Proc AISTATS. Deep sparse rectifier neural networks, (2011)."},{"key":"68_CR17","unstructured":"M Cai, Y Shi, J Liu, in Proc ASRU. Deep maxout neural networks for speech recognition, (2013), pp. 291\u2013296."},{"key":"68_CR18","unstructured":"Y Miao, F Metze, S Rawat, in Proc ASRU. Deep maxout networks for low-resource speech recognition, (2013), pp. 398\u2013403."},{"key":"68_CR19","unstructured":"P Swietojanski, J Li, JT Huang, in Proc ICASSP. Investigation of maxout networks for speech recognition, (2014), pp. 7649\u20137653."},{"key":"68_CR20","unstructured":"X Zhang, J Trmal, D Povey, S Khudanpur, in Proc ICASSP. Improving deep neural network acoustic models using generalized maxout networks, (2014), pp. 215\u2013219."},{"key":"68_CR21","unstructured":"K Vesel\u00fd, M Karafi\u00e1t, F Gr\u00e9zl, in Proc ASRU. Convolutive bottleneck network features for LVCSR, (2011), pp. 42\u201347."},{"issue":"6","key":"68_CR22","first-page":"1094","volume":"18","author":"H Ketabdar","year":"2010","unstructured":"H Ketabdar, H Bourlard, Enhanced phone posteriors for improving speech recognition systems. IEEE Trans ASLP. 18(6), 1094\u20131106 (2010).","journal-title":"IEEE Trans ASLP"},{"issue":"2","key":"68_CR23","first-page":"225","volume":"19","author":"J Pinto","year":"2010","unstructured":"J Pinto, G. S. V. S Sivaram, M Magimai-Doss, H Hermansky, H Bourlard, Analysis of MLP based hierarchical phoneme posterior probability estimator. IEEE Trans ASLP. 19(2), 225\u2013241 (2010).","journal-title":"IEEE Trans ASLP"},{"key":"68_CR24","doi-asserted-by":"publisher","unstructured":"D Vasquez, R Gruhn, W Minker, Hierarchical neural network structures for phoneme recognition (Springer, Berlin, 2013).","DOI":"10.1007\/978-3-642-34425-1"},{"key":"68_CR25","unstructured":"L T\u00f3th, in Proc ICASSP. A hierarchical, context-dependent neural network architecture for improved phone recognition, (2011), pp. 5040\u20135043."},{"key":"68_CR26","unstructured":"Y Zhang, E Chuangsuwanich, J Glass, in Proc Interspeech. Language ID-based training of multilingual stacked bottleneck features, (2014), pp. 1\u20135."},{"key":"68_CR27","unstructured":"L T\u00f3th, in Proc Interspeech. Convolutional deep rectifier neural nets for phone recognition, (2013), pp. 1722\u20131726."},{"key":"68_CR28","unstructured":"GE Hinton, N Srivastava, A Krizhevsky, I Sutskever, R Salakhutdinov, Improving neural networks by preventing co-adaptation of feature detectors. CoRR. abs\/1207.0580 (2012)."},{"key":"68_CR29","unstructured":"M Cai, Y Shi, J Liu, in Proc ICASSP. Stochastic pooling maxout networks for low-resource speech recognition, (2014), pp. 3266\u20133270."},{"issue":"11","key":"68_CR30","doi-asserted-by":"publisher","first-page":"1641","DOI":"10.1109\/29.46546","volume":"37","author":"K-F Lee","year":"1989","unstructured":"K-F Lee, H-W Hon, Speaker-independent phone recognition using hidden Markov models. IEEE Trans ASSP. 37(11), 1641\u20131648 (1989).","journal-title":"IEEE Trans ASSP"},{"key":"68_CR31","unstructured":"X Glorot, Y Bengio, in Proc AISTATS. Understanding the difficulty of training deep feedforward neural networks, (2010), pp. 249\u2013256."},{"key":"68_CR32","unstructured":"B Kingsbury, in Proc ICASSP. Lattice-based optimization of sequence classification criteria for neural-network acoustic modeling, (2009), pp. 3761\u20133764."},{"key":"68_CR33","unstructured":"K Vesel\u00fd, A Ghoshal, L Burget, D Povey, in Proc. Interspeech. Sequence-discriminative training of deep neural networks, (2013), pp. 2345\u20132349."},{"key":"68_CR34","doi-asserted-by":"publisher","unstructured":"H Bourlard, N Morgan, Connectionist speech recognition\u2014a hybrid approach (Kluwer, Boston, 1994).","DOI":"10.1007\/978-1-4615-3210-1"},{"key":"68_CR35","unstructured":"L T\u00f3th, in Proc ICASSP. Phone recognition with deep sparse rectifier neural networks, (2013), pp. 6985\u20136989."},{"key":"68_CR36","unstructured":"GE Dahl, TN Sainath, GE Hinton, in Proc ICASSP. Improving deep neural networks for LVCSR using rectified linear units and dropout, (2013), pp. 8609\u20138613."},{"key":"68_CR37","unstructured":"MD Zeiler, M Ranzato, R Monga, M Mao, K Yang, QV Le, P Nguyen, A Senior, V Vanhoucke, J Dean, GE Hinton, in Proc ICASSP. On rectified linear units for speech processing, (2013), pp. 3517\u20133521."},{"key":"68_CR38","unstructured":"AL Maas, AY Hannun, AY Ng, in Proc ICML. Rectifier nonlinearities improve neural network acoustic models, (2013)."},{"key":"68_CR39","unstructured":"J-T Huang, J Li, Y Gong, in Proc ICASSP. An analysis of convolutional neural networks for speech recognition, (2015), pp. 4989\u20134993."},{"key":"68_CR40","unstructured":"Y Miao, F Metze, in Proc Interspeech. Convolutional neural networks for language-universal feature extraction and cross-language hybrid systems, (2014), pp. 800\u2013804."},{"key":"68_CR41","unstructured":"M Cai, Y Shi, J Kang, J Liu, T Su, in Proc ISCSLP. Convolutional maxout neural networks for low-resource speech recognition, (2014), pp. 133\u2013137."},{"key":"68_CR42","doi-asserted-by":"crossref","unstructured":"S Renals, P Swietojanski, in Proc HSCMA. Neural networks for distant speech recognition, (2014).","DOI":"10.1109\/HSCMA.2014.6843274"},{"key":"68_CR43","unstructured":"MD Zeiler, R Fergus, Stochastic pooling for regularization of deep convolutional neural networks. CoRR. abs\/1301.3557 (2013)."},{"key":"68_CR44","unstructured":"H Hermansky, D Ellis, S Sharma, in Proc ICASSP. Tandem connectionist feature extraction for conventional HMM systems, (2000), pp. 1635\u20131638."},{"key":"68_CR45","unstructured":"C Plahl, R Schl\u00fcter, H Ney, in Proc Interspeech. Hierarchical bottle neck features for LVCSR, (2010), pp. 1197\u20131200."},{"key":"68_CR46","unstructured":"D V\u00e1squez, G Aradilla, R Gruhn, W Minker, in Proc ASRU. A hierarchical structure for modeling inter and intra phonetic information for phoneme recognition, (2009), pp. 124\u2013129."},{"key":"68_CR47","doi-asserted-by":"crossref","unstructured":"T Gr\u00f3sz, L T\u00f3th, in Text, Speech and Dialogue, ed. by I Habernal, V Matousek. A comparison of deep neural network training methods for large vocabulary speech recognition (Springer, Berlin, 2013), pp. 36\u201343.","DOI":"10.1007\/978-3-642-40585-3_6"},{"key":"68_CR48","unstructured":"G Gosztolya, T Gr\u00f3sz, L T\u00f3th, D Imseng, in Proc ICASSP. Building context-dependent DNN acoustic models using Kullback-Leibler divergence-based state tying, (2015), pp. 4570\u20134574."},{"key":"68_CR49","unstructured":"L Deng, J Chen, in Proc ICASSP. Sequence classification using the high-level features extracted from deep neural networks, (2014), pp. 6844\u20136848."},{"key":"68_CR50","unstructured":"C Plahl, TN Sainath, B Ramabhadran, D Nahamoo, in Proc ICASSP. Improved pre-training of deep belief networks using sparse encoding symmetric machines, (2012), pp. 4165\u20134168."},{"key":"68_CR51","unstructured":"O Abdel-Hamid, H Jiang, in Proc Interspeech. Rapid and effective speaker adaptation of convolutional neural network based models for speech recognition, (2013), pp. 1248\u20131252."},{"key":"68_CR52","unstructured":"A Graves, A Mohamed, GE Hinton, in Proc ICASSP. Speech recognition with deep recurrent neural networks, (2013), pp. 6645\u20136649."},{"key":"68_CR53","unstructured":"V Peddinti, TN Sainath, S Maymon, B Ramabhadran, D Nahamoo, V Goel, in Proc ICASSP. Deep scattering spectrum with deep neural networks, (2014), pp. 210\u2013214."}],"container-title":["EURASIP Journal on Audio, Speech, and Music Processing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-015-0068-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1186\/s13636-015-0068-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-015-0068-3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-015-0068-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,21]],"date-time":"2022-05-21T11:35:16Z","timestamp":1653132916000},"score":1,"resource":{"primary":{"URL":"https:\/\/asmp-eurasipjournals.springeropen.com\/articles\/10.1186\/s13636-015-0068-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,9,4]]},"references-count":53,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2015,12]]}},"alternative-id":["68"],"URL":"https:\/\/doi.org\/10.1186\/s13636-015-0068-3","relation":{},"ISSN":["1687-4722"],"issn-type":[{"value":"1687-4722","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015,9,4]]},"article-number":"25"}}