{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T18:52:39Z","timestamp":1777143159864,"version":"3.51.4"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030339036","type":"print"},{"value":"9783030339043","type":"electronic"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-33904-3_63","type":"book-chapter","created":{"date-parts":[[2019,10,25]],"date-time":"2019-10-25T22:40:05Z","timestamp":1572043205000},"page":"669-678","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["A Survey of the Effects of Data Augmentation for Automatic Speech Recognition Systems"],"prefix":"10.1007","author":[{"given":"Jose Manuel","family":"Ramirez","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ana","family":"Montalvo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jose Ramon","family":"Calvo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,10,22]]},"reference":[{"key":"63_CR1","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q.V., Vinyals, O.: Listen, attend and spell: a neural network for large vocabulary conversational speech recognition. In: ICASSP (2016). http:\/\/williamchan.ca\/papers\/wchan-icassp-2016.pdf","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"63_CR2","doi-asserted-by":"publisher","unstructured":"Chiu, C.C., et al.: State-of-the-art speech recognition with sequence-to-sequence models. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), April 2018. https:\/\/doi.org\/10.1109\/icassp.2018.8462105","DOI":"10.1109\/icassp.2018.8462105"},{"key":"63_CR3","doi-asserted-by":"publisher","unstructured":"Cui, X., Goel, V., Kingsbury, B.: Data augmentation for deep neural network acoustic modeling. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2014, vol. 23, pp. 5582\u20135586, May 2014. https:\/\/doi.org\/10.1109\/ICASSP.2014.6854671","DOI":"10.1109\/ICASSP.2014.6854671"},{"key":"63_CR4","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1016\/B978-0-08-051584-7.50010-3","volume-title":"Readings in Speech Recognition","author":"STEVEN B. DAVIS","year":"1990","unstructured":"Davis, S.B., Mermelstein, P.: Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. In: Readings in Speech Recognition, pp. 65\u201374. Morgan Kaufmann Publishers Inc., San Francisco (1990). http:\/\/dl.acm.org\/citation.cfm?id=108235.108239"},{"key":"63_CR5","unstructured":"DeVries, T., Taylor, G.W.: Improved regularization of convolutional neural networks with cutout (2017)"},{"key":"63_CR6","unstructured":"Evermann, G., Woodland, P.: Large vocabulary decoding and confidence estimation using word posterior probabilities (2000)"},{"key":"63_CR7","doi-asserted-by":"publisher","unstructured":"Gales, M.J.F., Ragni, A., AlDamarki, H., Gautier, C.: Support vector machines for noise robust ASR. In: 2009 IEEE Workshop on Automatic Speech Recognition Understanding, pp. 205\u2013210, November 2009. https:\/\/doi.org\/10.1109\/ASRU.2009.5372913","DOI":"10.1109\/ASRU.2009.5372913"},{"key":"63_CR8","unstructured":"Graves, A., Jaitly, N.: Towards end-to-end speech recognition with recurrent neural networks. In: Proceedings of the 31st International Conference on International Conference on Machine Learning, ICML 2014, vol. 32, pp. II-1764\u2013II-1772. JMLR.org (2014). http:\/\/dl.acm.org\/citation.cfm?id=3044805.3045089"},{"key":"63_CR9","unstructured":"Hannun, A., et al.: Deep speech: scaling up end-to-end speech recognition (2014)"},{"key":"63_CR10","unstructured":"Van den Heuvel, H., Choukri, K., Gollan, C., Moreno, A., Mostefa, D.: TC-STAR: new language resources for ASR and SLT purposes. In: LREC, pp. 2570\u20132573 (2006)"},{"key":"63_CR11","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton, G., et al.: Deep neural networks for acoustic modeling in speech recognition. Sig. Process. Mag. 29, 82\u201397 (2012)","journal-title":"Sig. Process. Mag."},{"key":"63_CR12","unstructured":"Hirsch, H.G.: Fant-filtering and noise adding tool. Niederrhein Univ. Appl. Sci. (2005). http:\/\/dnt.kr.hsnr.de\/download.html"},{"key":"63_CR13","doi-asserted-by":"crossref","unstructured":"Hori, T., Watanabe, S., Zhang, Y., Chan, W.: Advances in joint CTC-attention based end-to-end speech recognition with a deep CNN encoder and RNN-LM. CoRR abs\/1706.02737 (2017). http:\/\/arxiv.org\/abs\/1706.02737","DOI":"10.21437\/Interspeech.2017-1296"},{"key":"63_CR14","unstructured":"Jaitly, N., Hinton, E.: Vocal tract length perturbation (VTLP) improves speech recognition. In: Proceedings of the 30th International Conference on Machine Learning (2013)"},{"key":"63_CR15","doi-asserted-by":"publisher","unstructured":"Kajarekar, S.S., Yegnanarayana, B., Hermansky, H.: A study of two dimensional linear discriminants for ASR. In: Proceedings of the 2001 IEEE International Conference on Acoustics, Speech, and Signal Processing (Cat. No. 01CH37221), vol. 1, pp. 137\u2013140, May 2001. https:\/\/doi.org\/10.1109\/ICASSP.2001.940786","DOI":"10.1109\/ICASSP.2001.940786"},{"key":"63_CR16","doi-asserted-by":"publisher","unstructured":"Kanda, N., Takeda, R., Obuchi, Y.: Elastic spectral distortion for low resource speech recognition with deep neural networks. In: 2013 IEEE Workshop on Automatic Speech Recognition and Understanding, Olomouc, Czech Republic, 8\u201312 December 2013, pp. 309\u2013314 (2013). https:\/\/doi.org\/10.1109\/ASRU.2013.6707748","DOI":"10.1109\/ASRU.2013.6707748"},{"key":"63_CR17","doi-asserted-by":"crossref","unstructured":"Kim, C., et al.: Generation of large-scale simulated utterances in virtual rooms to train deep-neural networks for far-field speech recognition in Google home, pp. 379\u2013383 (2017). http:\/\/www.isca-speech.org\/archive\/Interspeech_2017\/pdfs\/1510.PDF","DOI":"10.21437\/Interspeech.2017-1510"},{"key":"63_CR18","doi-asserted-by":"crossref","unstructured":"Ko, T., Peddinti, V., Povey, D., Khudanpur, S.: Audio augmentation for speech recognition. In: INTERSPEECH (2015)","DOI":"10.21437\/Interspeech.2015-711"},{"key":"63_CR19","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1006\/csla.2001.0186","volume":"16","author":"L Lamel","year":"2002","unstructured":"Lamel, L., Gauvain, J.L., Adda, G.: Lightly supervised and unsupervised acoustic model training. Comput. Speech Lang. 16, 115\u2013129 (2002). https:\/\/doi.org\/10.1006\/csla.2001.0186","journal-title":"Comput. Speech Lang."},{"key":"63_CR20","doi-asserted-by":"crossref","unstructured":"Maas, A.L., Xie, Z., Jurafsky, D., Ng, A.Y.: Lexicon-free conversational speech recognition with neural networks. In: Proceedings of the North American Chapter of the Association for Computational Linguistics (NAACL) (2015)","DOI":"10.3115\/v1\/N15-1038"},{"key":"63_CR21","doi-asserted-by":"crossref","unstructured":"Miao, Y., Gowayyed, M., Metze, F.: EESEN: end-to-end speech recognition using deep RNN models and WFST-based decoding. CoRR abs\/1507.08240 (2015). http:\/\/arxiv.org\/abs\/1507.08240","DOI":"10.1109\/ASRU.2015.7404790"},{"key":"63_CR22","series-title":"Springer Handbooks","doi-asserted-by":"publisher","first-page":"559","DOI":"10.1007\/978-3-540-49127-9_28","volume-title":"Springer Handbook of Speech Processing","author":"M Mohri","year":"2008","unstructured":"Mohri, M., Pereira, F., Riley, M.: Speech recognition with weighted finite-state transducers. In: Benesty, J., Sondhi, M.M., Huang, Y.A. (eds.) Springer Handbook of Speech Processing. SH, pp. 559\u2013584. Springer, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-3-540-49127-9_28"},{"key":"63_CR23","doi-asserted-by":"crossref","unstructured":"Park, D.S., et al.: SpecAugment: a simple data augmentation method for automatic speech recognition. CoRR abs\/1904.08779 (2019). http:\/\/arxiv.org\/abs\/1904.08779","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"63_CR24","doi-asserted-by":"crossref","unstructured":"Peddinti, V., Chen, G., Povey, D., Khudanpur, S.: Reverberation robust acoustic modeling using i-vectors with time delay neural networks. In: INTERSPEECH (2015)","DOI":"10.21437\/Interspeech.2015-527"},{"key":"63_CR25","unstructured":"Povey, D.: A tutorial-style introduction to subspace Gaussian mixture models for speech recognition. Microsoft Research, Redmond, WA (2009)"},{"key":"63_CR26","unstructured":"Povey, D., et al.: The Kaldi speech recognition toolkit. Technical report, IEEE Signal Processing Society (2011)"},{"issue":"1","key":"63_CR27","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1016\/j.csl.2011.04.002","volume":"26","author":"D Povey","year":"2012","unstructured":"Povey, D., Yao, K.: A basis representation of constrained MLLR transforms for robust adaptation. Comput. Speech Lang. 26(1), 35\u201351 (2012)","journal-title":"Comput. Speech Lang."},{"key":"63_CR28","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"431","DOI":"10.1007\/978-3-540-74628-7_56","volume-title":"Text, Speech and Dialogue","author":"JV Psutka","year":"2007","unstructured":"Psutka, J.V.: Benefit of maximum likelihood linear transform (MLLT) used at different levels of covariance matrices clustering in ASR systems. In: Matou\u0161ek, V., Mautner, P. (eds.) TSD 2007. LNCS (LNAI), vol. 4629, pp. 431\u2013438. Springer, Heidelberg (2007). https:\/\/doi.org\/10.1007\/978-3-540-74628-7_56"},{"key":"63_CR29","doi-asserted-by":"crossref","unstructured":"Ragni, A., Knill, K.M., Rath, S.P., Gales, M.J.: Data augmentation for low resource languages (2014)","DOI":"10.21437\/Interspeech.2014-207"},{"key":"63_CR30","doi-asserted-by":"crossref","unstructured":"Thiemann, J., Ito, N., Vincent, E.: The diverse environments multi-channel acoustic noise database (DEMAND): a database of multichannel environmental noise recordings. In: Proceedings of Meetings on Acoustics, ICA 2013, vol. 19, p. 035081. ASA (2013)","DOI":"10.1121\/1.4799597"},{"key":"63_CR31","doi-asserted-by":"publisher","unstructured":"Wang, L., Gales, M.J.F., Woodland, P.C.: Unsupervised training for mandarin broadcast news and conversation transcription. In: 2007 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2007, vol. 4, pp. IV-353\u2013IV-356, April 2007. https:\/\/doi.org\/10.1109\/ICASSP.2007.366922","DOI":"10.1109\/ICASSP.2007.366922"},{"key":"63_CR32","doi-asserted-by":"publisher","unstructured":"Wang, L., Woodland, P.C.: Discriminative adaptive training using the MPE criterion. In: 2003 IEEE Workshop on Automatic Speech Recognition and Understanding (IEEE Cat. No. 03EX721), pp. 279\u2013284, November 2003. https:\/\/doi.org\/10.1109\/ASRU.2003.1318454","DOI":"10.1109\/ASRU.2003.1318454"},{"key":"63_CR33","series-title":"Springer Handbooks","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1007\/978-3-540-49127-9_27","volume-title":"Springer Handbook of Speech Processing","author":"S Young","year":"2008","unstructured":"Young, S.: HMMs and related speech recognition technologies. In: Benesty, J., Sondhi, M.M., Huang, Y.A. (eds.) Springer Handbook of Speech Processing. SH, pp. 539\u2013558. Springer, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-3-540-49127-9_27"},{"key":"63_CR34","doi-asserted-by":"crossref","unstructured":"Zavaliagkos, G., Colthurst, T.: Utilizing untranscribed training data to improve performance. In: DARPA Broadcast News Transcription and Understanding Workshop, Landsdowne, pp. 301\u2013305 (1998)","DOI":"10.21437\/ICSLP.1998-679"}],"container-title":["Lecture Notes in Computer Science","Progress in Pattern Recognition, Image Analysis, Computer Vision, and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-33904-3_63","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T14:12:41Z","timestamp":1710252761000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-33904-3_63"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030339036","9783030339043"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-33904-3_63","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"22 October 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CIARP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Iberoamerican Congress on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Havana","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Cuba","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2019","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 October 2019","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 October 2019","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ciarp2019","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/ciarp.uci.cu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OCS","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"128","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"70","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"55% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}