{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:33:45Z","timestamp":1769819625559,"version":"3.49.0"},"reference-count":105,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2019,6,11]],"date-time":"2019-06-11T00:00:00Z","timestamp":1560211200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,6,11]],"date-time":"2019-06-11T00:00:00Z","timestamp":1560211200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Circuits Syst Signal Process"],"published-print":{"date-parts":[[2019,8]]},"DOI":"10.1007\/s00034-019-01157-3","type":"journal-article","created":{"date-parts":[[2019,6,11]],"date-time":"2019-06-11T11:03:01Z","timestamp":1560250981000},"page":"3406-3432","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":17,"title":["A Primer on Deep Learning Architectures and Applications in Speech Processing"],"prefix":"10.1007","volume":"38","author":[{"given":"Tokunbo","family":"Ogunfunmi","sequence":"first","affiliation":[]},{"given":"Ravi Prakash","family":"Ramachandran","sequence":"additional","affiliation":[]},{"given":"Roberto","family":"Togneri","sequence":"additional","affiliation":[]},{"given":"Yuanjun","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Xianjun","family":"Xia","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,6,11]]},"reference":[{"issue":"10","key":"1157_CR1","doi-asserted-by":"publisher","first-page":"1533","DOI":"10.1109\/TASLP.2014.2339736","volume":"22","author":"O Abdel-Hamid","year":"2014","unstructured":"O. Abdel-Hamid, A.R. Mohamed, H. Jiang, L. Deng, G. Penn, D. Yu, Convolutional neural networks for speech recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 22(10), 1533\u20131545 (2014)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"1157_CR2","unstructured":"F. Abuzaid., Optimizing cpu performance for convolutional neural networks. Online. Available: http:\/\/cs231n.stanford.edu\/reports\/2015\/pdfs\/fabuzaid final report.pdf"},{"key":"1157_CR3","doi-asserted-by":"crossref","unstructured":"M. Alwani, H. Chen, M. Ferdman, P. Milder, Fused-layer CNN accelerators, in The 49th Annual IEEE\/ACM International Symposium on Microarchitecture (IEEE Press, New Jersey, 2016)","DOI":"10.1109\/MICRO.2016.7783725"},{"key":"1157_CR4","unstructured":"P. Angelov, A. Sperduti, Challenges in deep learning, in Proceedings of ESANN (2016), pp. 489\u2013495"},{"key":"1157_CR5","doi-asserted-by":"crossref","unstructured":"A. Ansari, K. Gunnam, T. Ogunfunmi, An efficient reconfigurable hardware accelerator for convolutional neural networks, in 51st Asilomar Conference on Signals, Systems, and Computers (IEEE, 2017), pp. 1337\u20131341","DOI":"10.1109\/ACSSC.2017.8335571"},{"key":"1157_CR6","unstructured":"A. Ansari, T. Ogunfunmi, An Efficient Network Agnostic Architecture Design and Analysis for Convolutional Neural Networks. submitted to the IEEE JETCAS, Special Issue on Customized sub-systems and circuits for deep learning (2019)"},{"key":"1157_CR7","first-page":"2206","volume":"7","author":"A Bhandare","year":"2016","unstructured":"A. Bhandare, M. Bhide, P. Gokhale, R. Chandavarkar, Applications of convolutional neural networks. Int. J. Comput. Sci. Inf. Technol. 7, 2206\u20132215 (2016)","journal-title":"Int. J. Comput. Sci. Inf. Technol."},{"key":"1157_CR8","volume-title":"Pattern Recognition and Machine Learning","author":"CM Bishop","year":"2006","unstructured":"C.M. Bishop, Pattern Recognition and Machine Learning (Springer, Berlin, 2006)"},{"key":"1157_CR9","unstructured":"S. B\u00f6ck, M. Schedl, Polyphonic piano note transcription with recurrent neural networks, in 2012 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2012), pp. 121\u2013124"},{"issue":"4","key":"1157_CR10","doi-asserted-by":"publisher","first-page":"205","DOI":"10.1109\/TSA.2002.1011533","volume":"10","author":"WM Campbell","year":"2002","unstructured":"W.M. Campbell, K.T. Assaleh, C.C. Broun, Speaker recognition with polynomial classifiers. IEEE Trans. Speech Audio Process. 10(4), 205\u2013212 (2002)","journal-title":"IEEE Trans. Speech Audio Process."},{"issue":"3","key":"1157_CR11","doi-asserted-by":"publisher","first-page":"247","DOI":"10.1145\/1816038.1815993","volume":"38","author":"S Chakradhar","year":"2010","unstructured":"S. Chakradhar, M. Sankaradas, V. Jakkula, S. Cadambi, A dynamically configurable coprocessor for convolutional neural networks. ACM SIGARCH Comput. Archit. News 38(3), 247\u2013257 (2010)","journal-title":"ACM SIGARCH Comput. Archit. News"},{"issue":"1","key":"1157_CR12","doi-asserted-by":"publisher","first-page":"59","DOI":"10.1109\/89.365380","volume":"3","author":"JH Chen","year":"1995","unstructured":"J.H. Chen, A. Gersho, Adaptive postfiltering for quality enhancement of coded speech. IEEE Trans. Speech Audio Process. 3(1), 59\u201371 (1995)","journal-title":"IEEE Trans. Speech Audio Process."},{"issue":"3","key":"1157_CR13","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1145\/3007787.3001177","volume":"44","author":"YH Chen","year":"2016","unstructured":"Y.H. Chen, J. Emer, V. Sze, Eyeriss: a spatial architecture for energy-efficient dataflow for convolutional neural networks. ACM SIGARCH Comput. Archit. News 44(3), 367\u2013379 (2016)","journal-title":"ACM SIGARCH Comput. Archit. News"},{"issue":"1","key":"1157_CR14","doi-asserted-by":"publisher","first-page":"127","DOI":"10.1109\/JSSC.2016.2616357","volume":"52","author":"YH Chen","year":"2017","unstructured":"Y.H. Chen, T. Krishna, J.S. Emer, V. Sze, Eyeriss: an energy-efficient reconfigurable accelerator for deep convolutional neural networks. IEEE J. Solid-State Circuits 52(1), 127\u2013138 (2017)","journal-title":"IEEE J. Solid-State Circuits"},{"key":"1157_CR15","unstructured":"S. Chetlur, C. Woolley, P. Vandermersch, J. Cohen, J. Tran, B. Catanzaro, E. Shelhamer, cudnn: efficient primitives for deep learning (2014). arXiv preprint arXiv:1410.0759"},{"key":"1157_CR16","unstructured":"K. Cho, B. Van Merri\u00ebnboer, C. Gulcehre, D. Bahdanau, F. Bougares, H. Schwenk, Y. Bengio, Learning phrase representations using RNN encoder-decoder for statistical machine translation, in Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) (2014), pp. 1724\u20131734"},{"key":"1157_CR17","unstructured":"J. Chung, C. Gulcehre, K. Cho, Y. Bengio, Empirical evaluation of gated recurrent neural networks on sequence modeling (2014). arXiv preprint. arXiv:1412.3555"},{"key":"1157_CR18","unstructured":"D.A. Clevert, T. Unterthiner, S. Hochreiter, Fast and accurate deep network learning by exponential linear units (ELUS). arXiv preprint arXiv:1511.07289"},{"key":"1157_CR19","doi-asserted-by":"crossref","unstructured":"J. Cong, Z. Fang, M. Lo, H. Wang, J. Xu, S. Zhang, Understanding performance differences of FPGAs and GPUs, in 2018 IEEE 26th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM). (IEEE, 2018), pp. 93\u201396","DOI":"10.1109\/FCCM.2018.00023"},{"issue":"4","key":"1157_CR20","doi-asserted-by":"publisher","first-page":"788","DOI":"10.1109\/TASL.2010.2064307","volume":"19","author":"N Dehak","year":"2011","unstructured":"N. Dehak, P.J. Kenny, R. Dehak, P. Dumouchel, P. Ouellet, Front-end factor analysis for speaker verification. IEEE Trans. Audio Speech Lang. Process. 19(4), 788\u2013798 (2011)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"1157_CR21","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1142\/9789814656535_0002","volume-title":"Handbook of Pattern Recognition and Computer Vision","author":"L Deng","year":"2016","unstructured":"L. Deng, N. Jaitly, Deep discriminative and generative models for speech pattern recognition, in Handbook of Pattern Recognition and Computer Vision, ed. by C.H. Chen (World Scientific, Singapore, 2016), pp. 27\u201352"},{"key":"1157_CR22","unstructured":"R. Dey, F.M. Salemt, Gate-variants of Gated Recurrent Unit (GRU) neural networks, in 60th International Midwest Symposium on Circuits and Systems (MWSCAS) (2017), pp. 1597\u20131600"},{"key":"1157_CR23","unstructured":"J.S. Edwards, R.P. Ramachandran, U. Thayasivam, Robust speaker verification with a two classifier format and feature enhancement, in IEEE international symposium on circuits and systems (ISCAS) (2017), pp. 1\u20134"},{"issue":"3","key":"1157_CR24","doi-asserted-by":"publisher","first-page":"572","DOI":"10.1016\/j.patcog.2010.09.020","volume":"44","author":"M El Ayadi","year":"2011","unstructured":"M. El Ayadi, M.S. Kamel, F. Karray, Survey on speech emotion recognition: features, classification schemes, and databases. Pattern Recognit. 44(3), 572\u2013587 (2011)","journal-title":"Pattern Recognit."},{"key":"1157_CR25","first-page":"625","volume":"11","author":"D Erhan","year":"2010","unstructured":"D. Erhan, Y. Bengio, A. Courville, P.A. Manzagol, P. Vincent, S. Bengio, Why does unsupervised pre-training help deep learning? J. Mach. Learn. Res. 11, 625\u2013660 (2010)","journal-title":"J. Mach. Learn. Res."},{"issue":"2","key":"1157_CR26","doi-asserted-by":"publisher","first-page":"62","DOI":"10.1109\/MCAS.2011.941080","volume":"11","author":"A Fazel","year":"2011","unstructured":"A. Fazel, S. Chakrabartty, An overview of statistical pattern recognition techniques for speaker verification. IEEE Circuits Syst. Mag. 11(2), 62\u201381 (2011)","journal-title":"IEEE Circuits Syst. Mag."},{"key":"1157_CR27","unstructured":"J. Fowers, G. Brown, P. Cooke, G. Stitt, A performance and energy comparison of FPGAs, GPUs, and multicores for sliding-window applications, in Proceedings of the ACM\/SIGDA International Symposium on Field Programmable Gate Arrays (2012), pp. 47\u201356"},{"key":"1157_CR28","unstructured":"S.W. Fu, Y. Tsao, X. Lu, H. Kawai, Raw waveform-based speech enhancement by fully convolutional networks (2017). arXiv preprint arXiv:1703.02205"},{"key":"1157_CR29","unstructured":"S.W. Fu, Y. Tsao, X. Lu, SNR-aware convolutional neural network modeling for speech enhancement, in Interspeech (2016), pp. 3768\u20133772"},{"key":"1157_CR30","unstructured":"F.A. Gers, J. Schmidhuber, F. Cummins, Learning to forget: continual prediction with LSTM, in 9th International Conference on Artificial Neural Networks (ICANN) (1999), pp. 850\u2013855"},{"issue":"3","key":"1157_CR31","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TASL.2010.2052803","volume":"19","author":"PK Ghosh","year":"2011","unstructured":"P.K. Ghosh, A. Tsiartas, S. Narayanan, Robust voice activity detection using long-term signal variability. IEEE Trans. Audio Speech Lang. Process. 19(3), 600\u2013613 (2011)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"1157_CR32","unstructured":"X. Glorot, A. Bordes, Y. Bengio, Deep sparse rectifier neural networks, in Proceedings of the 14th International Conference on Artificial Intelligence and Statistics (2011), pp. 315\u2013323"},{"key":"1157_CR33","volume-title":"Deep Learning","author":"I Goodfellow","year":"2016","unstructured":"I. Goodfellow, Y. Bengio, A. Courville, Y. Bengio, Deep Learning (MIT Press, Cambridge, 2016)"},{"key":"1157_CR34","doi-asserted-by":"crossref","unstructured":"S. Han, B. Dally, Efficient methods and hardware for deep learning. University Lecture (2017)","DOI":"10.1145\/3149166.3149168"},{"key":"1157_CR35","unstructured":"S. Han, X. Liu, H. Mao, J. Pu, A. Pedram, M.A. Horowitz, W.J. Dally, EIE: efficient inference engine on compressed deep neural network, in ACM\/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA) (2016), pp. 243\u2013254"},{"key":"1157_CR36","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016), pp. 770\u2013778"},{"issue":"7","key":"1157_CR37","doi-asserted-by":"publisher","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"GE Hinton","year":"2006","unstructured":"G.E. Hinton, S. Osindero, Y.W. Teh, A fast learning algorithm for deep belief nets. Neural Comput. 18(7), 1527\u20131554 (2006)","journal-title":"Neural Comput."},{"issue":"5786","key":"1157_CR38","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1126\/science.1127647","volume":"313","author":"GE Hinton","year":"2006","unstructured":"G.E. Hinton, R.R. Salakhutdinov, Reducing the dimensionality of data with neural networks. Science 313(5786), 504\u2013507 (2006)","journal-title":"Science"},{"issue":"8","key":"1157_CR39","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"S. Hochreiter, J. Schmidhuber, Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"1157_CR40","unstructured":"C.W. Huang, S. Narayanan, Characterizing types of convolution in deep convolutional recurrent neural networks for robust speech emotion recognition (2017). arXiv preprint arXiv:1706.02901"},{"key":"1157_CR41","unstructured":"N.P. Jouppi, C. Young, N. Patil, D. Patterson, G. Agrawal, R. Bajwa, R. Boyle, et al., In-datacenter performance analysis of a tensor processing unit, in 2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture (ISCA) (2017), pp. 1\u201312"},{"issue":"4","key":"1157_CR42","doi-asserted-by":"publisher","first-page":"1435","DOI":"10.1109\/TASL.2006.881693","volume":"15","author":"P Kenny","year":"2007","unstructured":"P. Kenny, G. Boulianne, P. Ouellet, P. Dumouchel, Joint factor analysis versus eigenchannels in speaker recognition. IEEE Trans. Audio Speech Lang. Process. 15(4), 1435\u20131447 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"8","key":"1157_CR43","doi-asserted-by":"publisher","first-page":"591","DOI":"10.1109\/TSA.2002.804302","volume":"10","author":"HK Kim","year":"2002","unstructured":"H.K. Kim, R.V. Cox, R.C. Rose, Performance improvement of a bitstream-based front-end for wireless speech recognition in adverse environments. IEEE Trans. Speech Audio Process. 10(8), 591\u2013604 (2002)","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"1157_CR44","unstructured":"D.P. Kingma, M. Welling, Auto-encoding variational bayes (2013). arXiv preprint. arXiv:1312.6114"},{"issue":"1","key":"1157_CR45","doi-asserted-by":"publisher","first-page":"12","DOI":"10.1016\/j.specom.2009.08.009","volume":"52","author":"T Kinnunen","year":"2010","unstructured":"T. Kinnunen, H. Li, An overview of text-independent speaker recognition: from features to supervectors. Speech Commun. 52(1), 12\u201340 (2010)","journal-title":"Speech Commun."},{"key":"1157_CR46","unstructured":"P.W. Koh, P. Liang, Understanding black-box predictions via influence functions, in Proceedings of the 34th International Conference on Machine Learning-Volume 70 (JMLR. org, 2017), pp. 1885\u20131894"},{"key":"1157_CR47","unstructured":"A. Krizhevsky, I. Sutskever, G. E. Hinton, Imagenet classification with deep convolutional neural networks, in Advances in Neural Information Processing Systems (2012), pp. 1097\u20131105"},{"key":"1157_CR48","doi-asserted-by":"crossref","unstructured":"H.T. Kung, B. McDanel, S.Q. Zhang, Mapping systolic arrays onto 3D circuit structures: accelerating convolutional neural network inference, in IEEE Workshop on Signal Processing Systems (2018)","DOI":"10.1109\/SiPS.2018.8598454"},{"key":"1157_CR49","unstructured":"G. Lacey, G.W. Taylor, S. Areibi, Deep learning on fpgas: past, present, and future (2016). arXiv preprint arXiv:1602.04283"},{"issue":"7553","key":"1157_CR50","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"Y. LeCun, Y. Bengio, G. Hinton, Deep learning. Nature 521(7553), 436 (2015)","journal-title":"Nature"},{"issue":"11","key":"1157_CR51","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"Y. LeCun, L. Bottou, Y. Bengio, P. Haffner, Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998)","journal-title":"Proc. IEEE"},{"key":"1157_CR52","unstructured":"Z. Li, J. Eichel, A. Mishra, A. Achkar, K. Naik, A CPU-based algorithm for traffic optimization based on sparse convolutional neural networks, in Electrical and Computer Engineering (CCECE), 2017 IEEE 30th Canadian Conference IEEE (2017), pp. 1\u20135"},{"key":"1157_CR53","unstructured":"M. Lin, Q. Chen, S. Yan, Network in network (2013). arXiv preprint. arXiv:1312.4400"},{"key":"1157_CR54","unstructured":"Z.C. Lipton, J. Berkowitz, C. Elkan, A critical review of recurrent neural networks for sequence learning (2015). arXiv preprint. arXiv:1506.00019"},{"key":"1157_CR55","doi-asserted-by":"publisher","DOI":"10.1201\/9781420015836","volume-title":"Speech Enhancement: Theory and Practice","author":"PC Loizou","year":"2007","unstructured":"P.C. Loizou, Speech Enhancement: Theory and Practice (CRC Press, Boca Raton, 2007)"},{"key":"1157_CR56","unstructured":"A. Makhzani, B. Frey, K-sparse autoencoders (2013). arXiv preprint. arXiv:1312.5663"},{"issue":"1","key":"1157_CR57","doi-asserted-by":"publisher","first-page":"108","DOI":"10.1109\/TASL.2011.2158309","volume":"20","author":"T May","year":"2012","unstructured":"T. May, S. Van De Par, A. Kohlrausch, Noise-robust speaker recognition combining missing data techniques and universal background modeling. IEEE Trans. Audio, SpeechLang. Process. 20(1), 108\u2013121 (2012)","journal-title":"IEEE Trans. Audio, SpeechLang. Process."},{"key":"1157_CR58","doi-asserted-by":"crossref","unstructured":"A. McCree, Reducing speech coding distortion for speaker identification, in Ninth International Conference on Spoken Language Processing (2006)","DOI":"10.21437\/Interspeech.2006-176"},{"issue":"4","key":"1157_CR59","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1007\/BF02478259","volume":"5","author":"WS McCulloch","year":"1943","unstructured":"W.S. McCulloch, W. Pitts, A logical calculus of the ideas immanent in nervous activity. Bull. Math. Biophys. 5(4), 115\u2013133 (1943)","journal-title":"Bull. Math. Biophys."},{"key":"1157_CR60","doi-asserted-by":"crossref","unstructured":"M. McLaren, Y. Lei, N. Scheffer, L. Ferrer, Application of convolutional neural networks to speaker recognition in noisy conditions, in Fifteenth Annual Conference of the International Speech Communication Association (2014)","DOI":"10.21437\/Interspeech.2014-172"},{"issue":"5","key":"1157_CR61","doi-asserted-by":"publisher","first-page":"1711","DOI":"10.1109\/TASL.2007.899278","volume":"15","author":"J Ming","year":"2007","unstructured":"J. Ming, T.J. Hazen, J.R. Glass, D.A. Reynolds, Robust speaker recognition in noisy conditions. IEEE Trans. Audio Speech Lang. Process. 15(5), 1711\u20131723 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"1157_CR62","doi-asserted-by":"crossref","unstructured":"V. Mitra, H. Franco, Time-frequency convolutional networks for robust speech recognition. in 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU) (IEEE, 2015), pp. 317\u2013323","DOI":"10.1109\/ASRU.2015.7404811"},{"key":"1157_CR63","unstructured":"H. Muckenhirn, M.M. Doss, S. Marcell, Towards directly modeling raw speech signal for speaker verification using CNNs, in 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2018), pp. 4884\u20134888"},{"key":"1157_CR64","unstructured":"R.W. Mudrowsky, R.P. Ramachandran, U. Thayasivam, S.S. Shetty, Robust speaker recognition in the presence of speech coding distortion for remote access applications, in Proceedings of the International Conference on Data Mining (DMIN) (2016), p. 176"},{"key":"1157_CR65","unstructured":"C. Murphy, Y. Fu, Xilinx all programmable devices: a superior platform for compute-intensive systems. Xilinx White Paper (2017)"},{"key":"1157_CR66","unstructured":"V. Nair, G.E. Hinton, Rectified linear units improve restricted Boltzmann machines, in Proceedings of the 27th International Conference on Machine Learning (ICML) (2010), pp. 807\u2013814"},{"key":"1157_CR67","doi-asserted-by":"crossref","unstructured":"E. Nurvitadhi, J. Sim, D. Sheffield, A. Mishra, S. Krishnan, D. Marr, Accelerating recurrent neural networks in analytics servers: comparison of FPGA, CPU, GPU, and ASIC, in 2016 26th International Conference on Field Programmable Logic and Applications (FPL) (IEEE, 2016), pp. 1\u20134","DOI":"10.1109\/FPL.2016.7577314"},{"key":"1157_CR68","doi-asserted-by":"crossref","unstructured":"E. Nurvitadhi, G. Venkatesh, J. Sim, D. Marr, R. Huang, J. Ong Gee Hock, G. Boudoukh, et al., Can FPGAs beat GPUs in accelerating next-generation deep neural networks? in Proceedings of the 2017 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays (ACM, 2017), pp. 5\u201314","DOI":"10.1145\/3020078.3021740"},{"issue":"2","key":"1157_CR69","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s11265-016-1200-z","volume":"89","author":"R Ondusko","year":"2017","unstructured":"R. Ondusko, M. Marbach, R.P. Ramachandran, L.M. Head, Blind signal-to-noise ratio estimation of speech based on vector quantizer classifiers and decision level fusion. J. Signal Process. Syst. 89(2), 335\u2013345 (2017)","journal-title":"J. Signal Process. Syst."},{"issue":"11","key":"1157_CR70","first-page":"1","volume":"2","author":"K Ovtcharov","year":"2015","unstructured":"K. Ovtcharov, O. Ruwase, J.Y. Kim, J. Fowers, K. Strauss, E.S. Chung, Accelerating deep convolutional neural networks using specialized hardware. Microsoft Res. Whitepaper 2(11), 1\u20134 (2015)","journal-title":"Microsoft Res. Whitepaper"},{"issue":"6","key":"1157_CR71","doi-asserted-by":"publisher","first-page":"1291","DOI":"10.1109\/TASLP.2017.2690575","volume":"25","author":"G Parascandolo","year":"2017","unstructured":"G. Parascandolo, T. Heittola, H. Huttunen, T. Virtanen, Convolutional recurrent neural networks for polyphonic sound event detection. IEEE\/ACM Trans. Audio Speech Lang. Process. 25(6), 1291\u20131303 (2017)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"3","key":"1157_CR72","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1109\/MCAS.2016.2583681","volume":"16","author":"M Parchami","year":"2016","unstructured":"M. Parchami, W.P. Zhu, B. Champagne, E. Plourde, Recent developments in speech enhancement in the short-time Fourier transform domain. IEEE Circuits Syst. Mag. 16(3), 45\u201377 (2016)","journal-title":"IEEE Circuits Syst. Mag."},{"key":"1157_CR73","unstructured":"C. Poultney, S. Chopra, Y.L. Cun, Efficient learning of sparse representations with an energy-based model, in Advances in Neural Information Processing Systems (2007), pp. 1137\u20131144"},{"issue":"12","key":"1157_CR74","doi-asserted-by":"publisher","first-page":"2263","DOI":"10.1109\/TASLP.2016.2602884","volume":"24","author":"Y Qian","year":"2016","unstructured":"Y. Qian, M. Bi, T. Tan, K. Yu, Very deep convolutional neural networks for noise robust speech recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 24(12), 2263\u20132276 (2016)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"2","key":"1157_CR75","doi-asserted-by":"publisher","first-page":"364","DOI":"10.1109\/49.613","volume":"6","author":"V Ramamoorthy","year":"1988","unstructured":"V. Ramamoorthy, N.S. Jayant, R.V. Cox, M.M. Sondhi, Enhancement of ADPCM speech coding with backward-adaptive algorithms for postfiltering and noise feedback. IEEE J. Select. Areas Commun. 6(2), 364\u2013382 (1988)","journal-title":"IEEE J. Select. Areas Commun."},{"key":"1157_CR76","doi-asserted-by":"crossref","unstructured":"S. Rifai, P. Vincent, X. Muller, X. Glorot, Y. Bengio, Contractive auto-encoders: explicit invariance during feature extraction, in Proceedings of the 28th International Conference on International Conference on Machine Learning (Omnipress, 2011), pp. 833\u2013840","DOI":"10.1007\/978-3-642-23783-6_41"},{"issue":"6","key":"1157_CR77","doi-asserted-by":"publisher","first-page":"386","DOI":"10.1037\/h0042519","volume":"65","author":"F Rosenblatt","year":"1958","unstructured":"F. Rosenblatt, The perceptron: a probabilistic model for information storage and organization in the brain. Psychol. Rev. 65(6), 386 (1958)","journal-title":"Psychol. Rev."},{"key":"1157_CR78","doi-asserted-by":"crossref","unstructured":"D.E. Rumelhart, G.E. Hinton, R.J. Williams, Learning internal representations by error propagation (No. ICS-8506). California Univ San Diego La Jolla Inst for Cognitive Science (1985)","DOI":"10.21236\/ADA164453"},{"issue":"3","key":"1157_CR79","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"O. Russakovsky, J. Deng, H. Su, J. Krause, S. Satheesh, S. Ma, A.C. Berg, Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"1157_CR80","unstructured":"S. Sabour, N. Frosst, G.E. Hinton, Dynamic routing between capsules, in Advances in Neural Information Processing Systems (2017), pp. 3856\u20133866"},{"key":"1157_CR81","doi-asserted-by":"crossref","unstructured":"T.N. Sainath, R.J. Weiss, A. Senior, K.W. Wilson, O. Vinyals, Learning the speech front-end with raw waveform CLDNNs, in Sixteenth Annual Conference of the International Speech Communication Association (2015)","DOI":"10.21437\/Interspeech.2015-1"},{"key":"1157_CR82","doi-asserted-by":"publisher","first-page":"85","DOI":"10.1016\/j.neunet.2014.09.003","volume":"61","author":"J Schmidhuber","year":"2015","unstructured":"J. Schmidhuber, Deep learning in neural networks: an overview. Neural Netw. 61, 85\u2013117 (2015)","journal-title":"Neural Netw."},{"key":"1157_CR83","unstructured":"Y. Shen, M. Ferdman, P. Milder, Maximizing CNN accelerator efficiency through resource partitioning, in ACM\/IEEE 44th Annual International Symposium on Computer Architecture (ISCA) (2017), pp. 535\u2013547"},{"key":"1157_CR84","unstructured":"K. Simonyan, A. Zisserman, Very deep convolutional networks for large-scale image recognition (2014). arXiv preprint. arXiv:1409.1556"},{"issue":"2","key":"1157_CR85","doi-asserted-by":"publisher","first-page":"8","DOI":"10.1109\/MCAS.2011.941078","volume":"11","author":"BY Smolenski","year":"2011","unstructured":"B.Y. Smolenski, R.P. Ramachandran, Usable speech processing: a filterless approach in the presence of interference. IEEE Circuits Syst. Mag. 11(2), 8\u201322 (2011)","journal-title":"IEEE Circuits Syst. Mag."},{"issue":"7","key":"1157_CR86","doi-asserted-by":"publisher","first-page":"1415","DOI":"10.1109\/TASL.2013.2253096","volume":"21","author":"BV Srinivasan","year":"2013","unstructured":"B.V. Srinivasan, Y. Luo, D. Garcia-Romero, D.N. Zotkin, R.A. Duraiswami, symmetric kernel partial least squares framework for speaker recognition. IEEE Trans. Audio Speech Lang. Process. 21(7), 1415\u20131423 (2013)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"3","key":"1157_CR87","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1145\/3190618","volume":"51","author":"K Sundararajan","year":"2018","unstructured":"K. Sundararajan, D.L. Woodard, Deep learning for biometrics: a survey. ACM Comput. Surv. (CSUR) 51(3), 65 (2018)","journal-title":"ACM Comput. Surv. (CSUR)"},{"issue":"12","key":"1157_CR88","doi-asserted-by":"publisher","first-page":"2295","DOI":"10.1109\/JPROC.2017.2761740","volume":"105","author":"V Sze","year":"2017","unstructured":"V. Sze, Y.H. Chen, T.J. Yang, J.S. Emer, Efficient processing of deep neural networks: a tutorial and survey. Proc. IEEE 105(12), 2295\u20132329 (2017)","journal-title":"Proc. IEEE"},{"key":"1157_CR89","unstructured":"C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, A. Rabinovich, Going deeper with convolutions, in IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015), pp. 1\u20139"},{"issue":"2","key":"1157_CR90","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1109\/MCAS.2011.941079","volume":"11","author":"R Togneri","year":"2011","unstructured":"R. Togneri, D. Pullella, An overview of speaker identification: accuracy and robustness issues. IEEE Circuits Syst. Mag. 11(2), 23\u201361 (2011)","journal-title":"IEEE Circuits Syst. Mag."},{"key":"1157_CR91","unstructured":"G. Trigeorgis, F. Ringeval, R. Brueckner, E. Marchi, M.A. Nicolaou, B. Schuller, S. Zafeiriou, Adieu features? End-to-end speech emotion recognition using a deep convolutional recurrent network, in 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2016) pp. 5200\u20135204"},{"issue":"10","key":"1157_CR92","doi-asserted-by":"publisher","first-page":"1294","DOI":"10.1016\/j.specom.2006.06.006","volume":"48","author":"Z Tufekci","year":"2006","unstructured":"Z. Tufekci, J.N. Gowdy, S. Gurbuz, E. Patterson, Applied mel-frequency discrete wavelet coefficients and parallel model compensation for noise-robust speech recognition. Speech Commun. 48(10), 1294\u20131307 (2006)","journal-title":"Speech Commun."},{"key":"1157_CR93","unstructured":"V. Vanhoucke, A. Senior, M.Z. Mao, Improving the speed of neural networks on CPUs, in Proceedings of Deep Learning and Unsupervised Feature Learning NIPS Workshop (2011)"},{"key":"1157_CR94","unstructured":"P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol, Extracting and composing robust features with denoising autoencoders, in ACM Proceedings of the 25th International Conference on Machine Learning(2008), pp. 1096\u20131103"},{"key":"1157_CR95","first-page":"3371","volume":"11","author":"P Vincent","year":"2010","unstructured":"P. Vincent, H. Larochelle, I. Lajoie, Y. Bengio, P.A. Manzagol, Stacked denoising autoencoders: learning useful representations in a deep network with a local denoising criterion. J. Mach. Learn. Res. 11, 3371\u20133408 (2010)","journal-title":"J. Mach. Learn. Res."},{"issue":"1","key":"1157_CR96","doi-asserted-by":"publisher","first-page":"196","DOI":"10.1109\/TASL.2010.2045800","volume":"19","author":"N Wang","year":"2011","unstructured":"N. Wang, P.C. Ching, N. Zheng, T. Lee, Robust speaker recognition using denoised vocal source and vocal tract features. IEEE Trans. Audio Speech Lang. Process. 19(1), 196\u2013205 (2011)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"1157_CR97","unstructured":"Y. Wang, L. Neves, F. Metze, Audio-based multimedia event detection using deep recurrent neural networks, in 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2016), pp. 2742\u20132746"},{"key":"1157_CR98","doi-asserted-by":"crossref","unstructured":"F. Weninger, H. Erdogan, S. Watanabe, E. Vincent, J. Le Roux, J.R. Hershey, B. Schuller, Speech enhancement with LSTM recurrent neural networks and its application to noise-robust ASR, in International Conference on Latent Variable Analysis and Signal Separation (Springer, Cham, 2015), pp. 91\u201399","DOI":"10.1007\/978-3-319-22482-4_11"},{"key":"1157_CR99","unstructured":"P. Werbos, Beyond regression: new tools for prediction and analysis in the behavioral sciences. PhD thesis, Harvard University (1974)"},{"issue":"1","key":"1157_CR100","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1109\/TASLP.2014.2364452","volume":"23","author":"Y Xu","year":"2015","unstructured":"Y. Xu, J. Du, L.R. Dai, C.H. Lee, A regression approach to speech enhancement based on deep neural networks. IEEE\/ACM Trans. Audio Speech Lang. Process. (TASLP) 23(1), 7\u201319 (2015)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process. (TASLP)"},{"key":"1157_CR101","unstructured":"N. Yang, R. Muraleedharan, J. Kohl, I. Demirkol, W. Heinzelman, M. Sturge-Apple, Speech-based emotion classification using multiclass SVM with hybrid kernel and thresholding fusion. In spoken language technology workshop (slt) (2012), pp. 455\u2013460"},{"key":"1157_CR102","unstructured":"R. Zazo Candil, T.N. Sainath, G. Simko, C. Parada, Feature learning with raw-waveform CLDNNs for voice activity detection, in Interspeech (2016), pp. 3668\u20133672"},{"key":"1157_CR103","doi-asserted-by":"crossref","unstructured":"M.D. Zeiler, R. Fergus, Visualizing and understanding convolutional networks, in European Conference on Computer Vision (Springer, Cham, 2014), pp. 818\u2013833","DOI":"10.1007\/978-3-319-10590-1_53"},{"key":"1157_CR104","doi-asserted-by":"crossref","unstructured":"C. Zhang, P. Li, G. Sun, Y. Guan, B. Xiao, J. Cong, Optimizing fpga-based accelerator design for deep convolutional neural networks, in Proceedings of the 2015 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays (ACM, 2015), pp. 161\u2013170","DOI":"10.1145\/2684746.2689060"},{"key":"1157_CR105","unstructured":"Z. Zhao, H. Liu, T. Fingscheidt, Convolutional neural networks to enhance coded speech (2018). arXiv preprint arXiv:1806.09411"}],"container-title":["Circuits, Systems, and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-019-01157-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00034-019-01157-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-019-01157-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,20]],"date-time":"2022-09-20T09:23:21Z","timestamp":1663665801000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00034-019-01157-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,6,11]]},"references-count":105,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2019,8]]}},"alternative-id":["1157"],"URL":"https:\/\/doi.org\/10.1007\/s00034-019-01157-3","relation":{},"ISSN":["0278-081X","1531-5878"],"issn-type":[{"value":"0278-081X","type":"print"},{"value":"1531-5878","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,6,11]]},"assertion":[{"value":"22 December 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 May 2019","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 May 2019","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 June 2019","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}