{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T18:21:42Z","timestamp":1775845302213,"version":"3.50.1"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031806063","type":"print"},{"value":"9783031806070","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-80607-0_16","type":"book-chapter","created":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T16:35:43Z","timestamp":1735662943000},"page":"200-213","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["ASR Systems Under Acoustic Challenges: A Multilingual Study"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-1000-295X","authenticated-orcid":false,"given":"Sergei","family":"Katkov","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2773-4421","authenticated-orcid":false,"given":"Antonio","family":"Liotta","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4166-540X","authenticated-orcid":false,"given":"Alessandro","family":"Vietti","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,1]]},"reference":[{"key":"16_CR1","doi-asserted-by":"publisher","unstructured":"Adolfi, F., Bowers, J.S., Poeppel, D.: Successes and critical failures of neural networks in capturing human-like speech recognition. Neural Netw. 162(C), 199\u2013211 (may 2023). https:\/\/doi.org\/10.1016\/j.neunet.2023.02.032","DOI":"10.1016\/j.neunet.2023.02.032"},{"key":"16_CR2","unstructured":"Ardila, R., et al.: Common voice: A massively-multilingual speech corpus. In: International Conference on Language Resources and Evaluation (2019). https:\/\/api.semanticscholar.org\/CorpusID:209376338"},{"key":"16_CR3","unstructured":"Balam, J., Huang, J., Lavrukhin, V., Deng, S., Majumdar, S., Ginsburg, B.: Improving noise robustness of an end-to-end neural model for automatic speech recognition (2020)"},{"key":"16_CR4","unstructured":"Cieri, C., Miller, D., Walker, K.: The fisher corpus: A resource for the next generations of speech-to-text (01 2004)"},{"key":"16_CR5","unstructured":"Cui, T., Xiao, J., Li, L., Jiang, X., Liu, Q.: An approach to improve robustness of nlp systems against asr errors. ArXiv abs\/2103.13610 (2021). https:\/\/api.semanticscholar.org\/CorpusID:232352551"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Duarte, J.C., Colcher, S.: Building a noisy audio dataset to evaluate machine learning approaches for automatic speech recognition systems. ArXiv abs\/2110.01425 (2018). https:\/\/api.semanticscholar.org\/CorpusID:238259030","DOI":"10.17771\/PUCRio.DImcc.60957"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Eickhoff, P., M\u00f6ller, M., Pekarek-Rosin, T., Twiefel, J., Wermter, S.: Bring the noise: Introducing noise robustness to pretrained automatic speech recognition. In: International Conference on Artificial Neural Networks (2023). https:\/\/api.semanticscholar.org\/CorpusID:261559431","DOI":"10.1007\/978-3-031-44195-0_31"},{"key":"16_CR8","unstructured":"Frieske, R., Shi, B.E.: Hallucinations in neural automatic speech recognition: Identifying errors and hallucinatory models (2024)"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Fucci, D., Gaido, M., Negri, M., Cettolo, M., Bentivogli, L.: No pitch left behind: Addressing gender unbalance in automatic speech recognition through pitch manipulation. 2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp.\u00a01\u20138 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263830339","DOI":"10.1109\/ASRU57964.2023.10389767"},{"key":"16_CR10","doi-asserted-by":"publisher","unstructured":"Godfrey, J., Holliman, E., McDaniel, J.: Switchboard: telephone speech corpus for research and development. In: [Proceedings] ICASSP-92: 1992 IEEE International Conference on Acoustics, Speech, and Signal Processing. vol.\u00a01, pp. 517\u2013520 vol.1 (1992). https:\/\/doi.org\/10.1109\/ICASSP.1992.225858","DOI":"10.1109\/ICASSP.1992.225858"},{"key":"16_CR11","unstructured":"Google: Sentencepiece. https:\/\/github.com\/google\/sentencepiece"},{"key":"16_CR12","doi-asserted-by":"publisher","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural \u2019networks. vol.\u00a02006, pp. 369\u2013376 (01 2006). https:\/\/doi.org\/10.1145\/1143844.1143891","DOI":"10.1145\/1143844.1143891"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Gulati, A., et al.: Conformer: Convolution-augmented Transformer for Speech Recognition (2020)","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"16_CR14","doi-asserted-by":"publisher","unstructured":"Higuchi, Y., Tawara, N., Ogawa, A., Iwata, T., Kobayashi, T., Ogawa, T.: Noise-robust attention learning for end-to-end speech recognition. In: 2020 28th European Signal Processing Conference (EUSIPCO).,pp. 311\u2013315 (2021). https:\/\/doi.org\/10.23919\/Eusipco47968.2020.9287488","DOI":"10.23919\/Eusipco47968.2020.9287488"},{"key":"16_CR15","unstructured":"Holtzman, A., Buys, J., Forbes, M., Choi, Y.: The curious case of neural text degeneration. CoRR abs\/1904.09751 (2019). http:\/\/arxiv.org\/abs\/1904.09751"},{"key":"16_CR16","unstructured":"Huang, J., et al.: Cross-language transfer learning, continuous learning, and domain adaptation for end-to-end automatic speech recognition (2020)"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Katkov, S., Liotta, A., Vietti, A.: Benchmarking whisper under diverse audio transformations and real-time constraints. In: Proceedings of the 26th International Conference on Speech and Computer (SPECOM) (2024)","DOI":"10.1007\/978-3-031-77961-9_6"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Katkov, S., Liotta, A., Vietti, A.: Evaluating the robustness of ASR systems in adverse acoustic conditions. In: Proceedings of the Fifth International Conference on Intelligent Data Science Technologies and Applications (IDSTA) (2024)","DOI":"10.1109\/IDSTA62194.2024.10746999"},{"key":"16_CR19","doi-asserted-by":"publisher","unstructured":"Kriman, S., et al.: Quartznet: Deep automatic speech recognition with 1d time-channel separable convolutions. In: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6124\u20136128 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053889","DOI":"10.1109\/ICASSP40776.2020.9053889"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Lee, G.W., Kim, H.K.: Two-step joint optimization with auxiliary loss function for noise-robust speech recognition. Sensors (Basel, Switzerland) 22 (2022). https:\/\/api.semanticscholar.org\/CorpusID:250942334","DOI":"10.3390\/s22145381"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Li, J., Deng, L., Gong, Y., H\u00e4b-Umbach, R.: An overview of noise-robust automatic speech recognition. IEEE\/ACM Transactions on Audio, Speech, and Language Processing 22, 745\u2013777 (2014), https:\/\/api.semanticscholar.org\/CorpusID:14557362","DOI":"10.1109\/TASLP.2014.2304637"},{"key":"16_CR22","unstructured":"Mauch, M., Ewert, S.: The audio degradation toolbox and its application to robustness evaluation. In: International Society for Music Information Retrieval Conference (2013). https:\/\/api.semanticscholar.org\/CorpusID:11675708"},{"key":"16_CR23","doi-asserted-by":"publisher","unstructured":"M\u00fcller, J.A., Wendt, D., Kollmeier, B., Debener, S., Brand, T.: Effect of speech rate on neural tracking of speech. Front. Psychol. 10 (2019). https:\/\/doi.org\/10.3389\/fpsyg.2019.00449, https:\/\/www.frontiersin.org\/journals\/psychology\/articles\/10.3389\/fpsyg.2019.00449","DOI":"10.3389\/fpsyg.2019.00449"},{"key":"16_CR24","doi-asserted-by":"publisher","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: An asr corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210 (2015). https:\/\/doi.org\/10.1109\/ICASSP.2015.7178964","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"16_CR25","doi-asserted-by":"publisher","unstructured":"Paul, D.B., Baker, J.M.: The design for the wall street journal-based csr corpus. In: Proceedings of the Workshop on Speech and Natural Language. p. 357\u2013362. HLT \u201991, Association for Computational Linguistics, USA (1992). https:\/\/doi.org\/10.3115\/1075527.1075614","DOI":"10.3115\/1075527.1075614"},{"key":"16_CR26","doi-asserted-by":"publisher","unstructured":"Payton, K.L., Uchanski, R.M., Braida, L.D.: Intelligibility of conversational and clear speech in noise and reverberation for listeners with normal and impaired hearing. J. Acoust. Society America 95(3), 1581\u20131592 (03 1994). https:\/\/doi.org\/10.1121\/1.408545","DOI":"10.1121\/1.408545"},{"key":"16_CR27","doi-asserted-by":"publisher","unstructured":"Pratap, V., Xu, Q., Sriram, A., Synnaeve, G., Collobert, R.: Mls: A large-scale multilingual dataset for speech research. In: Interspeech 2020. ISCA (Oct 2020). https:\/\/doi.org\/10.21437\/interspeech.2020-2826","DOI":"10.21437\/interspeech.2020-2826"},{"key":"16_CR28","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision (2022)"},{"key":"16_CR29","doi-asserted-by":"crossref","unstructured":"Rekesh, D., et al.: Fast conformer with linearly scalable attention for efficient speech recognition (2023)","DOI":"10.1109\/ASRU57964.2023.10389701"},{"key":"16_CR30","unstructured":"Saito, K., et al.: Unsupervised vocal dereverberation with diffusion-based generative models (2022)"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Schwartz, B., Gannot, S., Habets, E.: Online speech dereverberation using kalman filter and em algorithm. IEEE\/ACM Trans. Audio, Speech, Lang. Process. 23, 394\u2013406 (2015). https:\/\/api.semanticscholar.org\/CorpusID:2413399","DOI":"10.1109\/TASLP.2014.2372342"},{"key":"16_CR32","doi-asserted-by":"publisher","first-page":"336","DOI":"10.1007\/978-3-642-16327-2_40","volume-title":"Intelligent Information Processing V","author":"U Shrawankar","year":"2010","unstructured":"Shrawankar, U., Thakare, V.: Noise estimation and noise removal techniques for speech recognition in adverse environment. In: Shi, Z., Vadera, S., Aamodt, A., Leake, D. (eds.) Intelligent Information Processing V, pp. 336\u2013342. Springer Berlin Heidelberg, Berlin, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-16327-2_40"},{"key":"16_CR33","unstructured":"Wang, C., et al.: Voxpopuli: A large-scale multilingual speech corpus for representation learning, semi-supervised learning and interpretation. CoRR abs\/2101.00390 (2021). https:\/\/arxiv.org\/abs\/2101.00390"},{"key":"16_CR34","doi-asserted-by":"publisher","unstructured":"Zhang, Q., Lu, H., Sak, H., Tripathi, A., McDermott, E., Koo, S., Kumar, S.: Transformer transducer: A streamable speech recognition model with transformer encoders and rnn-t loss. In: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7829\u20137833 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053896","DOI":"10.1109\/ICASSP40776.2020.9053896"}],"container-title":["Lecture Notes in Computer Science","AIxIA 2024 \u2013 Advances in Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-80607-0_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T17:04:44Z","timestamp":1735664684000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-80607-0_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031806063","9783031806070"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-80607-0_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"1 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"AIxIA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference of the Italian Association for Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bolzano","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"aiia2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}