{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T10:38:15Z","timestamp":1761129495232},"publisher-location":"Cham","reference-count":42,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319646794"},{"type":"electronic","value":"9783319646800"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-3-319-64680-0_5","type":"book-chapter","created":{"date-parts":[[2017,10,31]],"date-time":"2017-10-31T04:37:26Z","timestamp":1509424646000},"page":"105-133","source":"Crossref","is-referenced-by-count":6,"title":["Raw Multichannel Processing Using Deep Neural Networks"],"prefix":"10.1007","author":[{"given":"Tara N.","family":"Sainath","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ron J.","family":"Weiss","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kevin W.","family":"Wilson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Arun","family":"Narayanan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michiel","family":"Bacchiani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ehsan","family":"Variani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Izhak","family":"Shafran","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Andrew","family":"Senior","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kean","family":"Chin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ananya","family":"Misra","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chanwoo","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,7,26]]},"reference":[{"issue":"4","key":"5_CR1","doi-asserted-by":"crossref","first-page":"943","DOI":"10.1121\/1.382599","volume":"65","author":"J.B. Allen","year":"1979","unstructured":"Allen, J.B., Berkley, D.A.: Image method for efficiently simulation room-small acoustics. J. Acoust. Soc. Am. 65(4), 943\u2013950 (1979)","journal-title":"J. Acoust. Soc. Am."},{"key":"5_CR2","volume-title":"Microphone Array Signal Processing","author":"J. Benesty","year":"2009","unstructured":"Benesty, J., Chen, J., Huang, Y.: Microphone Array Signal Processing. Springer, Berlin (2009)"},{"key":"5_CR3","volume-title":"Scaling Learning Algorithms Towards AI","author":"Y. Bengio","year":"2007","unstructured":"Bengio, Y., Lecun, Y.: Scaling Learning Algorithms Towards AI. Large Scale Kernel Machines. MIT press, Cambridge (2007)"},{"key":"5_CR4","unstructured":"Bengio, S., Vinyals, O., Jaitly, N., Shazeer, N.: Scheduled sampling for sequence prediction with recurrent neural networks. In: Advances in Neural Information Processing Systems, pp.\u00a01171\u20131179 (2015)"},{"key":"5_CR5","volume-title":"The Fourier Transform and Its Applications","author":"R. Bracewell","year":"1999","unstructured":"Bracewell, R.: The Fourier Transform and Its Applications, 3rd edn. McGraw-Hill, New York (1999)","edition":"3"},{"key":"5_CR6","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-662-04619-7","volume-title":"Microphone Arrays: Signal Processing Techniques and Applications","author":"M. Brandstein","year":"2001","unstructured":"Brandstein, M., Ward, D.: Microphone Arrays: Signal Processing Techniques and Applications. Springer, Berlin (2001)"},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"Chen, Z., Watanabe, S., Erdo\u011fan, H., Hershey, J.R.: Speech enhancement and recognition using multi-task learning of long short-term memory recurrent neural networks. In: Proceedings of Interspeech, pp.\u00a03274\u20133278. ISCA (2015)","DOI":"10.21437\/Interspeech.2015-659"},{"key":"5_CR8","unstructured":"Chung, J., Gulcehre, C., Cho, K., Bengio, Y.: Gated feedback recurrent neural networks. arXiv preprint. arXiv:1502.02367 (2015)"},{"key":"5_CR9","volume-title":"Large scale distributed deep networks","author":"J. Dean","year":"2012","unstructured":"Dean, J., Corrado, G., Monga, R., Chen, K., Devin, M., Le, Q., Mao, M., Ranzato, M., Senior, A., Tucker, P., Yang, K., Ng, A.: Large scale distributed deep networks. In: Proceedings of NIPS (2012)"},{"key":"5_CR10","volume-title":"Linear prediction-based dereverberation with advanced speech enhancement and recognition technologies for the REVERB challenge","author":"M. Delcroix","year":"2014","unstructured":"Delcroix, M., Yoshioka, T., Ogawa, A., Kubo, Y., Fujimoto, M., Ito, N., Kinoshita, K., Espi, M., Hori, T., Nakatani, T., Nakamura, A.: Linear prediction-based dereverberation with advanced speech enhancement and recognition technologies for the REVERB challenge. In: REVERB Workshop (2014)"},{"key":"5_CR11","doi-asserted-by":"crossref","unstructured":"Dieleman, S., Schrauwen, B.: End-to-end learning for music audio. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a06964\u20136968. IEEE (2014)","DOI":"10.1109\/ICASSP.2014.6854950"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Giri, R., Seltzer, M.L., Droppo, J., Yu, D.: Improving speech recognition in reverberation using a room-aware deep neural network and multi-task learning. In: Proceedings of ICASSP, pp.\u00a05014\u20135018. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178925"},{"key":"5_CR13","volume-title":"Understanding the difficulty of training deep feedforward neural networks","author":"X. Glorot","year":"2014","unstructured":"Glorot, X., Bengio, Y.: Understanding the difficulty of training deep feedforward neural networks. In: Proceedings of AISTATS (2014)"},{"issue":"1","key":"5_CR14","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1109\/TAP.1982.1142739","volume":"30","author":"L.J. Griffiths","year":"1982","unstructured":"Griffiths, L.J., Jim, C.W.: An alternative approach to linearly constrained adaptive beamforming. IEEE Trans. Antennas Propag. 30(1), 27\u201334 (1982)","journal-title":"IEEE Trans. Antennas Propag."},{"issue":"2","key":"5_CR15","doi-asserted-by":"crossref","first-page":"486","DOI":"10.1109\/TASL.2011.2163395","volume":"20","author":"T. Hain","year":"2012","unstructured":"Hain, T., Burget, L., Dines, J., Garner, P., Grezl, F., Hannani, A., Huijbregts, M., Karafiat, M., Lincoln, M., Wan, V.: Transcribing meetings with the AMIDA systems. IEEE Trans. Audio Speech Lang. Process. 20(2), 486\u2013498 (2012)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"5_CR16","volume-title":"Asynchronous stochastic optimization for sequence training of deep neural networks","author":"G. Heigold","year":"2014","unstructured":"Heigold, G., McDermott, E., Vanhoucke, V., Senior, A., Bacchiani, M.: Asynchronous stochastic optimization for sequence training of deep neural networks. In: Proceedings of ICASSP (2014)"},{"key":"5_CR17","unstructured":"Hershey, J.R., Roux, J.L., Weninger, F.: Deep unfolding: model-based inspiration of novel deep architectures. CoRR abs\/1409.2574 (2014)"},{"key":"5_CR18","volume-title":"Speech acoustic modeling from raw multichannel waveforms","author":"Y. Hoshen","year":"2015","unstructured":"Hoshen, Y., Weiss, R.J., Wilson, K.W.: Speech acoustic modeling from raw multichannel waveforms. In: Proceedings of ICASSP (2015)"},{"key":"5_CR19","volume-title":"Learning a better representation of speech soundwaves using restricted Boltzmann machines","author":"N. Jaitly","year":"2011","unstructured":"Jaitly, N., Hinton, G.: Learning a better representation of speech soundwaves using restricted Boltzmann machines. In: Proceedings of ICASSP (2011)"},{"issue":"4","key":"5_CR20","doi-asserted-by":"crossref","first-page":"320","DOI":"10.1109\/TASSP.1976.1162830","volume":"24","author":"C.H. Knapp","year":"1976","unstructured":"Knapp, C.H., Carter, G.C.: The generalized correlation method for estimation of time delay. IEEE Trans. Acoust. Speech Signal Process. 24(4), 320\u2013327 (1976)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"5_CR21","volume-title":"Neural network adaptive beamforming for robust multichannel speech recognition","author":"B. Li","year":"2016","unstructured":"Li, B., Sainath, T.N., Weiss, R.J., Wilson, K.W., Bacchiani, M.: Neural network adaptive beamforming for robust multichannel speech recognition. In: Proceedings of Interspeech (2016)"},{"key":"5_CR22","volume-title":"Using neural network front-ends on far-field multiple microphones based speech recognition","author":"Y. Liu","year":"2014","unstructured":"Liu, Y., Zhang, P., Hain, T.: Using neural network front-ends on far-field multiple microphones based speech recognition. In: Proceedings of ICASSP (2014)"},{"key":"5_CR23","volume-title":"Understanding how deep belief networks perform acoustic modelling","author":"A. Mohamed","year":"2012","unstructured":"Mohamed, A., Hinton, G., Penn, G.: Understanding how deep belief networks perform acoustic modelling. In: Proceedings of ICASSP (2012)"},{"key":"5_CR24","volume-title":"Estimating phoneme class conditional probabilities from raw speech signal using convolutional neural networks","author":"D. Palaz","year":"2014","unstructured":"Palaz, D., Collobert, R., Doss, M.: Estimating phoneme class conditional probabilities from raw speech signal using convolutional neural networks. In: Proceedings of Interspeech (2014)"},{"key":"5_CR25","volume-title":"Improvements to deep convolutional neural networks for LVCSR","author":"T.N. Sainath","year":"2013","unstructured":"Sainath, T.N., Kingsbury, B., Mohamed, A., Dahl, G., Saon, G., Soltau, H., Beran, T., Aravkin, A., Ramabhadran, B.: Improvements to deep convolutional neural networks for LVCSR. In: Proceedings of ASRU (2013)"},{"key":"5_CR26","volume-title":"Modeling time\u2013frequency patterns with LSTM vs. convolutional architectures for LVCSR tasks","author":"T.N. Sainath","year":"2016","unstructured":"Sainath, T.N., Li, B.: Modeling time\u2013frequency patterns with LSTM vs. convolutional architectures for LVCSR tasks. In: Proceedings of Interspeech (2016)"},{"key":"5_CR27","volume-title":"Low-rank matrix factorization for deep neural network training with high-dimensional output targets","author":"T.N. Sainath","year":"2013","unstructured":"Sainath, T.N., Kingsbury, B., Sindhwani, V., Arisoy, E., Ramabhadran, B.: Low-rank matrix factorization for deep neural network training with high-dimensional output targets. In: Proceedings of ICASSP (2013)"},{"key":"5_CR28","volume-title":"Convolutional, long short-term memory, fully connected deep neural networks","author":"T.N. Sainath","year":"2015","unstructured":"Sainath, T.N., Vinyals, O., Senior, A., Sak, H.: Convolutional, long short-term memory, fully connected deep neural networks. In: Proceedings of ICASSP (2015)"},{"key":"5_CR29","volume-title":"Speaker localization and microphone spacing invariant acoustic modeling from raw multichannel waveforms","author":"T.N. Sainath","year":"2015","unstructured":"Sainath, T.N., Weiss, R.J., Wilson, K.W., Narayanan, A., Bacchiani, M., Senior, A.: Speaker localization and microphone spacing invariant acoustic modeling from raw multichannel waveforms. In: Proceedings of ASRU (2015)"},{"key":"5_CR30","volume-title":"Learning the speech front-end with raw waveform CLDNNs","author":"T.N. Sainath","year":"2015","unstructured":"Sainath, T.N., Weiss, R.J., Wilson, K.W., Senior, A., Vinyals, O.: Learning the speech front-end with raw waveform CLDNNs. In: Proceedings of Interspeech (2015)"},{"key":"5_CR31","volume-title":"Reducing the computational complexity of multimicrophone acoustic models with integrated feature extraction","author":"T.N. Sainath","year":"2016","unstructured":"Sainath, T.N., Narayanan, A., Weiss, R.J., Wilson, K.W., Bacchiani, M., Shafran, I.: Reducing the computational complexity of multimicrophone acoustic models with integrated feature extraction. In: Proceedings of Interspeech (2016)"},{"key":"5_CR32","volume-title":"Factored spatial and spectral multichannel raw waveform CLDNNs","author":"T.N. Sainath","year":"2016","unstructured":"Sainath, T.N., Weiss, R.J., Wilson, K.W., Narayanan, A., Bacchiani, M.: Factored spatial and spectral multichannel raw waveform CLDNNs. In: Proceedings of ICASSP (2016)"},{"key":"5_CR33","volume-title":"Long short-term memory recurrent neural network architectures for large scale acoustic modeling","author":"H. Sak","year":"2014","unstructured":"Sak, H., Senior, A., Beaufays, F.: Long short-term memory recurrent neural network architectures for large scale acoustic modeling. In: Proceedings of Interspeech (2014)"},{"issue":"5","key":"5_CR34","doi-asserted-by":"crossref","first-page":"489","DOI":"10.1109\/TSA.2004.832988","volume":"12","author":"M. Seltzer","year":"2004","unstructured":"Seltzer, M., Raj, B., Stern, R.M.: Likelihood-maximizing beamforming for robust handsfree speech recognition. IEEE Trans. Audio Speech Lang. Process. 12(5), 489\u2013498 (2004)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"5_CR35","doi-asserted-by":"crossref","unstructured":"Stolcke, A., Anguera, X., Boakye, K., \u00c7etin, O., Janin, A., Magimai-Doss, M., Wooters, C., Zheng, J.: The SRI-ICSI Spring 2007 meeting and lecture recognition system. In: Multimodal Technologies for Perception of Humans. Lecture Notes in Computer Science, vol. 2, pp. 450\u2013463. Springer, Berlin (2008)","DOI":"10.1007\/978-3-540-68585-2_42"},{"key":"5_CR36","volume-title":"Hybrid acoustic models for distant and multichannel large vocabulary speech recognition","author":"P. Swietojanski","year":"2013","unstructured":"Swietojanski, P., Ghoshal, A., Renals, S.: Hybrid acoustic models for distant and multichannel large vocabulary speech recognition. In: Proceedings of ASRU (2013)"},{"key":"5_CR37","volume-title":"Acoustic modeling with deep neural networks using raw time signal for LVCSR","author":"Z. T\u00fcske","year":"2014","unstructured":"T\u00fcske, Z., Golik, P., Schl\u00fcter, R., Ney, H.: Acoustic modeling with deep neural networks using raw time signal for LVCSR. In: Proceedings of Interspeech (2014)"},{"key":"5_CR38","volume-title":"Complex linear projection (CLP): a discriminative approach to joint feature extraction and acoustic modeling","author":"E. Variani","year":"2016","unstructured":"Variani, E., Sainath, T.N., Shafran, I.: Complex linear projection (CLP): a discriminative approach to joint feature extraction and acoustic modeling. In: Proceedings of Interspeech (2016)"},{"issue":"2","key":"5_CR39","doi-asserted-by":"crossref","first-page":"4","DOI":"10.1109\/53.665","volume":"5","author":"B.D. Veen","year":"1988","unstructured":"Veen, B.D., Buckley, K.M.: Beamforming: a versatile approach to spatial filtering. IEEE ASSP Mag. 5(2), 4\u201324 (1988)","journal-title":"IEEE ASSP Mag."},{"key":"5_CR40","volume-title":"Deep beamforming networks for multi-channel speech recognition","author":"X. Xiao","year":"2016","unstructured":"Xiao, X., Watanabe, S., Erdogan, H., Lu, L., Hershey, J., Seltzer, M.L., Chen, G., Zhang, Y., Mandel, M., Yu, D.: Deep beamforming networks for multi-channel speech recognition. In: Proceedings of ICASSP (2016)"},{"key":"5_CR41","doi-asserted-by":"crossref","unstructured":"Xiao, X., Zhao, S., Zhong, X., Jones, D.L., Chng, E.S., Li, H.: A learning-based approach to direction of arrival estimation in noisy and reverberant environments. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a02814\u20132818. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178484"},{"key":"5_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Chuangsuwanich, E., Glass, J.R.: Extracting deep neural network bottleneck features using low-rank matrix factorization. In: ICASSP, pp.\u00a0185\u2013189 (2014)","DOI":"10.1109\/ICASSP.2014.6853583"}],"container-title":["New Era for Robust Speech Recognition"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-64680-0_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,5]],"date-time":"2022-08-05T18:20:00Z","timestamp":1659723600000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-64680-0_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9783319646794","9783319646800"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-64680-0_5","relation":{},"subject":[],"published":{"date-parts":[[2017]]}}}