{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,6]],"date-time":"2025-07-06T15:10:05Z","timestamp":1751814605881,"version":"3.41.0"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319995786"},{"type":"electronic","value":"9783319995793"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-319-99579-3_71","type":"book-chapter","created":{"date-parts":[[2018,8,24]],"date-time":"2018-08-24T07:36:09Z","timestamp":1535096169000},"page":"697-706","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["A Perceptually Inspired Data Augmentation Method for Noise Robust CNN Acoustic Models"],"prefix":"10.1007","author":[{"given":"L\u00e1szl\u00f3","family":"T\u00f3th","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gy\u00f6rgy","family":"Kov\u00e1cs","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dirk","family":"Van Compernolle","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2018,8,25]]},"reference":[{"key":"71_CR1","doi-asserted-by":"crossref","unstructured":"Abdel-Hamid, O., Mohamed, A., Jiang, H., Penn, G.: Applying convolutional neural network concepts to hybrid NN-HMM model for speech recognition. In: Proceedings of ICASSP, pp. 4277\u20134280 (2012)","DOI":"10.1109\/ICASSP.2012.6288864"},{"key":"71_CR2","doi-asserted-by":"publisher","first-page":"147","DOI":"10.1016\/j.dsp.2014.03.001","volume":"29","author":"MJ Alam","year":"2014","unstructured":"Alam, M.J., Kenny, P., O\u2019Shaughnessy, D.: Robust feature extraction based on an asymmetric level-dependent auditory filterbank and a subband spectrum enhancement technique. Digital Signal Process. 29, 147\u2013157 (2014)","journal-title":"Digital Signal Process."},{"issue":"4","key":"71_CR3","doi-asserted-by":"publisher","first-page":"567","DOI":"10.1109\/89.326615","volume":"2","author":"JB Allen","year":"1994","unstructured":"Allen, J.B.: How do humans process and recognize speech? IEEE Trans. Speech Audio Proc. 2(4), 567\u2013577 (1994)","journal-title":"IEEE Trans. Speech Audio Proc."},{"key":"71_CR4","doi-asserted-by":"crossref","unstructured":"Baby, D., Gemmeke, J.F., Virtanen, T., Van Hamme, H.: Exemplar-based speech enhancement for deep neural network based automatic speech recognition. In: Proceedings of ICASSP, pp. 4485\u20134489 (2015)","DOI":"10.1109\/ICASSP.2015.7178819"},{"key":"71_CR5","unstructured":"Bouthillier, X., Konda, K., Vincent, P., Memisevic, R.: Dropout as data augmentation. ArXiv e-prints (2015)"},{"key":"71_CR6","doi-asserted-by":"crossref","unstructured":"Chang, S.Y., Morgan, N.: Robust CNN-based speech recognition with Gabor filter kernels. In: Proceedings of Interspeech, pp. 905\u2013909 (2014)","DOI":"10.21437\/Interspeech.2014-226"},{"key":"71_CR7","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1016\/0378-5955(79)90012-1","volume":"1","author":"L Chistovich","year":"1979","unstructured":"Chistovich, L., Lublinskaja, V.: The center of gravity effect in vowel spectra and critical distance between the formants. Hear. Res. 1, 185\u2013195 (1979)","journal-title":"Hear. Res."},{"key":"71_CR8","doi-asserted-by":"crossref","unstructured":"Cui, X., Goel, V., Kingsbury, B.: Data augmentation for deep neural network acoustic modeling. In: Proceedings of ICASSP, pp. 5619\u20135623 (2014)","DOI":"10.1109\/ICASSP.2014.6854671"},{"key":"71_CR9","doi-asserted-by":"crossref","unstructured":"Deng, L., Abdel-Hamid, O., Yu, D.: A deep convolutional neural network using heterogeneous pooling for trading acoustic invariance with phonetic confusion. In: Proceedings of ICASSP, pp. 6669\u20136673 (2013)","DOI":"10.1109\/ICASSP.2013.6638952"},{"key":"71_CR10","doi-asserted-by":"crossref","unstructured":"Flores, J., Young, S.: Continuous speech recognition in noise using spectral subtraction and HMM adaptation. In: Proceedings of ICASSP, pp. 409\u2013412 (1994)","DOI":"10.1109\/ICASSP.1994.389269"},{"issue":"6","key":"71_CR11","doi-asserted-by":"publisher","first-page":"736","DOI":"10.1109\/TASSP.1987.1165223","volume":"35","author":"O Ghitza","year":"1987","unstructured":"Ghitza, O.: Auditory nerve representation criteria for speech analysis\/synthesis. IEEE Trans. ASSP 35(6), 736\u2013740 (1987)","journal-title":"IEEE Trans. ASSP"},{"key":"71_CR12","unstructured":"Graham, B., Reizenstein, J., Robinson, L.: Efficient batchwise dropout training using submatrices. ArXiv e-prints, February 2015"},{"key":"71_CR13","doi-asserted-by":"crossref","unstructured":"Hartmann, W., Ng, T., Hsiao, R., Tsakalidis, S., Schwartz, R.M.: Two-stage data augmentation for low-resourced speech recognition. In: Proceedings of Interspeech, pp. 2378\u20132382 (2016)","DOI":"10.21437\/Interspeech.2016-1386"},{"issue":"6","key":"71_CR14","doi-asserted-by":"publisher","first-page":"4041","DOI":"10.1121\/1.2188369","volume":"119","author":"JM Hillenbrand","year":"2006","unstructured":"Hillenbrand, J.M., Houde, R.A., Gayvert, R.T.: Speech perception based on spectral peaks versus spectral shape. J. Acoust. Soc. Am. 119(6), 4041\u20134054 (2006)","journal-title":"J. Acoust. Soc. Am."},{"key":"71_CR15","unstructured":"Hinton, G., Srivastava, N., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Improving neural networks by preventing co-adaptation of feature detectors. CoRR abs\/1207.0580 (2012)"},{"key":"71_CR16","unstructured":"Hirsch, H.G., Pearce, D.: The aurora experimental framework for the performance evaluation of speech recognition systems under noisy conditions. In: ASR2000-Automatic Speech Recognition: Challenges for the New Millenium ISCA Tutorial and Research Workshop (ITRW) (2000)"},{"key":"71_CR17","doi-asserted-by":"crossref","unstructured":"Hsiao, R., Ma, J., Hartmann, W., Karafiat, M., Grezl, F., Burget, L., Szoke, I., Cernocky, J., Watanabe, S., Chen, Z., Mallidi, S., Hermansky, H., Tsakalidis, S., Schwartz, R.: Robust speech recognition in unknown reverberant and noisy conditions. In: Proceedings of ASRU, pp. 533\u2013538. IEEE, December 2015","DOI":"10.1109\/ASRU.2015.7404841"},{"key":"71_CR18","doi-asserted-by":"crossref","unstructured":"Huang, J.T., Li, J., Gong, Y.: An analysis of convolutional neural networks for speech recognition. In: Proceedings of ICASSP, pp. 4989\u20134993 (2015)","DOI":"10.1109\/ICASSP.2015.7178920"},{"key":"71_CR19","doi-asserted-by":"crossref","unstructured":"Ikbal, S., Bourlard, H., Magimai-Doss, M.: Peak location estimation for noise robust speech recognition. In: Proceedings of ICASSP, pp. 453\u2013456 (2005)","DOI":"10.1109\/ICASSP.2005.1415148"},{"key":"71_CR20","unstructured":"Jaitly, N., Hinton, G.E.: Vocal tract length perturbation (VTLP) improves speech recognition. In: ICML (2013)"},{"key":"71_CR21","doi-asserted-by":"crossref","unstructured":"Kanda, N., Takeda, R., Obuchi, Y.: Elastic spectral distortion for low resource speech recognition with deep neural networks. In: Proceedings of ASRU, pp. 309\u2013314. IEEE (2013)","DOI":"10.1109\/ASRU.2013.6707748"},{"key":"71_CR22","doi-asserted-by":"crossref","unstructured":"Ko, T., Peddinti, V., Povey, D., Khudanpur, S.: Audio augmentation for speech recognition. In: Proceedings of Interspeech, pp. 3586\u20133589 (2015)","DOI":"10.21437\/Interspeech.2015-711"},{"key":"71_CR23","doi-asserted-by":"crossref","unstructured":"Ko, T., Peddinti, V., Povey, D., Seltzer, M.L., Khudanpur, S.: A study on data augmentation of reverberant speech for robust speech recognition. In: Proceedings of ICASSP (2017)","DOI":"10.1109\/ICASSP.2017.7953152"},{"issue":"1","key":"71_CR24","doi-asserted-by":"publisher","first-page":"117","DOI":"10.14232\/actacyb.22.1.2015.8","volume":"22","author":"G Kov\u00e1cs","year":"2015","unstructured":"Kov\u00e1cs, G., T\u00f3th, L.: Joint optimization of spectro-temporal features and deep neural nets for robust automatic speech recognition. Acta Cybernetica 22(1), 117\u2013134 (2015)","journal-title":"Acta Cybernetica"},{"key":"71_CR25","doi-asserted-by":"crossref","unstructured":"Lockwood, P., Boudy, J., Blanchet, M.: Non-linear spectral subtraction (NSS) and hidden Markov models for robust speech recognition in car noise environments. In: Proceedings of ICASSP (1992)","DOI":"10.1109\/ICASSP.1992.225921"},{"key":"71_CR26","doi-asserted-by":"crossref","unstructured":"Miao, Y., Metze, F.: Improving low-resource CD-DNN-HMM using dropout and multilingual DNN training. In: Proceedings of Interspeech, pp. 2237\u20132241 (2013)","DOI":"10.21437\/Interspeech.2013-526"},{"key":"71_CR27","doi-asserted-by":"crossref","DOI":"10.1163\/9789004658820","volume-title":"An Introduction to the Psychology of Hearing","author":"BCJ Moore","year":"1997","unstructured":"Moore, B.C.J.: An Introduction to the Psychology of Hearing. Academic Press, London (1997)"},{"key":"71_CR28","doi-asserted-by":"crossref","unstructured":"Peddinti, V., Chen, G., Manohar, V., Ko, T., Povey, D., Khudanpur, S.: JHU aspire system: robust LVCSR with tdnns, ivector adaptation and RNN-LMS. In: Proceedings of ASRU, pp. 539\u2013546 (2015)","DOI":"10.1109\/ASRU.2015.7404842"},{"key":"71_CR29","doi-asserted-by":"crossref","unstructured":"Ragni, A., Knill, K.M., Rath, S.P., Gales, M.J.F.: Data augmentation for low resource languages. In: Proceedings of Interspeech, pp. 810\u2013814. ISCA (2014)","DOI":"10.21437\/Interspeech.2014-207"},{"key":"71_CR30","doi-asserted-by":"crossref","unstructured":"Sainath, T.N., Mohamed, A., Kingsbury, B., Ramabhadran, B.: Deep convolutional neural networks for LVCSR. In: Proceedings of ICASSP, pp. 8614\u20138618 (2013)","DOI":"10.1109\/ICASSP.2013.6639347"},{"issue":"6","key":"71_CR31","doi-asserted-by":"publisher","first-page":"1647","DOI":"10.1121\/1.383662","volume":"66","author":"M Schroeder","year":"1979","unstructured":"Schroeder, M., Atal, B.S., Hall, J.L.: Optimizing digital speech coders by exploiting masking properties of the human ear. JASA 66(6), 1647\u20131652 (1979)","journal-title":"JASA"},{"key":"71_CR32","doi-asserted-by":"publisher","unstructured":"T\u00f3th, L.: Phone recognition with hierarchical convolutional deep maxout networks. EURASIP J. Audio Speech Music Process. 25 (2015). https:\/\/doi.org\/10.1186\/s13636-015-0068-3","DOI":"10.1186\/s13636-015-0068-3"},{"key":"71_CR33","doi-asserted-by":"crossref","unstructured":"Wan, W., Au, O., Keung, C., Yim, C.: A novel approach of low bit-rate speech coding based on sinusoidal representation and auditory model. In: Proceedings of Eurospeech, pp. 1555\u20131558 (1999)","DOI":"10.21437\/Eurospeech.1999-350"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-99579-3_71","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,6]],"date-time":"2025-07-06T14:43:25Z","timestamp":1751813005000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-99579-3_71"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783319995786","9783319995793"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-99579-3_71","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2018]]}}}