{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T10:30:03Z","timestamp":1760956203476,"version":"3.40.3"},"publisher-location":"Cham","reference-count":49,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783031243486"},{"type":"electronic","value":"9783031243493"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-24349-3_8","type":"book-chapter","created":{"date-parts":[[2023,4,3]],"date-time":"2023-04-03T10:09:17Z","timestamp":1680516557000},"page":"123-139","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Transformers in\u00a0Automatic Speech Recognition"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6276-8141","authenticated-orcid":false,"given":"Marc","family":"Evrard","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,4,4]]},"reference":[{"key":"8_CR1","unstructured":"Auli, M.: Wav2vec: self-supervised learning of speech representations. Talk at MIT, CMU, U of Edinburgh, Spring 2021 (2021)"},{"key":"8_CR2","unstructured":"Baevski, A., Schneider, S., Auli, M.: VQ-wav2vec: self-supervised learning of discrete speech representations. In: International Conference on Learning Representations (2019)"},{"key":"8_CR3","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: Wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems, vol. 33, pp. 12449\u201312460 (2020)"},{"key":"8_CR4","doi-asserted-by":"crossref","unstructured":"Bello, I., Zoph, B., Vaswani, A., Shlens, J., Le, Q.V.: Attention augmented convolutional networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3286\u20133295 (2019)","DOI":"10.1109\/ICCV.2019.00338"},{"key":"8_CR5","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q., Vinyals, O.: Listen, attend and spell: a neural network for large vocabulary conversational speech recognition. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4960\u20134964. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472621"},{"issue":"12","key":"8_CR6","doi-asserted-by":"publisher","first-page":"2041","DOI":"10.1109\/TASLP.2019.2938863","volume":"27","author":"J Chorowski","year":"2019","unstructured":"Chorowski, J., Weiss, R.J., Bengio, S., Van Den Oord, A.: Unsupervised speech representation learning using wavenet autoencoders. IEEE\/ACM Trans. Audio Speech Lang. Process. 27(12), 2041\u20132053 (2019)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"8_CR7","unstructured":"Chung, Y.A., Weng, W.H., Tong, S., Glass, J.: Unsupervised cross-modal alignment of speech and text embedding spaces. In: Advances in Neural Information Processing Systems, vol. 
31 (2018)"},{"key":"8_CR8","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, (Long and Short Papers), vol. 1, pp. 4171\u20134186 (2019)"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"Dong, L., Xu, S., Xu, B.: Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5884\u20135888. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"8_CR10","doi-asserted-by":"crossref","unstructured":"Eloff, R., et al.: Unsupervised acoustic unit discovery for speech synthesis using discrete latent-variable neural networks. In: INTERSPEECH (2019)","DOI":"10.21437\/Interspeech.2019-1518"},{"key":"8_CR11","doi-asserted-by":"crossref","unstructured":"Evain, S., et al.: Lebenchmark: a reproducible framework for assessing self-supervised representation learning from speech. In: INTERSPEECH 2021: Conference of the International Speech Communication Association (2021)","DOI":"10.21437\/Interspeech.2021-556"},{"key":"8_CR12","volume-title":"Acoustic Theory of Speech Production","author":"G Fant","year":"1960","unstructured":"Fant, G.: Acoustic Theory of Speech Production. Mouton & Co., The Hague (1960)"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd International Conference on Machine Learning, pp. 369\u2013376 (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"8_CR14","unstructured":"Graves, A., Jaitly, N.: Towards end-to-end speech recognition with recurrent neural networks. In: International Conference on Machine Learning, pp. 1764\u20131772. PMLR (2014)"},{"key":"8_CR15","doi-asserted-by":"crossref","unstructured":"Gulati, A., et al.: Conformer: convolution-augmented transformer for speech recognition. In: Proceedings of the Interspeech 2020, pp. 5036\u20135040 (2020)","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"8_CR16","unstructured":"Heafield, K., Pouzyrevsky, I., Clark, J.H., Koehn, P.: Scalable modified Kneser-Ney language model estimation. In: Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp. 690\u2013696 (2013)"},{"issue":"6","key":"8_CR17","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton, G., et al.: Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups. IEEE Signal Process. Mag. 29(6), 82\u201397 (2012)","journal-title":"IEEE Signal Process. Mag."},{"issue":"8","key":"8_CR18","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"8_CR19","doi-asserted-by":"crossref","unstructured":"Hori, T., Watanabe, S., Zhang, Y., Chan, W.: Advances in joint CTC-attention based end-to-end speech recognition with a deep CNN encoder and RNN-LM. 
In: Proceedings of the Interspeech 2017, pp. 949\u2013953 (2017)","DOI":"10.21437\/Interspeech.2017-1296"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Jaitly, N., Hinton, G.: Learning a better representation of speech soundwaves using restricted Boltzmann machines. In: 2011 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5884\u20135887. IEEE (2011)","DOI":"10.1109\/ICASSP.2011.5947700"},{"key":"8_CR21","unstructured":"Jang, E., Gu, S., Poole, B.: Categorical reparameterization with gumbel-softmax. In: ICLR 2017 Conference (2016)"},{"key":"8_CR22","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1016\/j.csl.2017.04.008","volume":"46","author":"H Kamper","year":"2017","unstructured":"Kamper, H., Jansen, A., Goldwater, S.: A segmental framework for fully-unsupervised large-vocabulary speech recognition. Comput. Speech Lang. 46, 154\u2013174 (2017)","journal-title":"Comput. Speech Lang."},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Karita, S., Soplin, N.E.Y., Watanabe, S., Delcroix, M., Ogawa, A., Nakatani, T.: Improving transformer-based end-to-end speech recognition with connectionist temporal classification and language model integration. In: Proceedings of the Interspeech 2019, pp. 1408\u20131412 (2019)","DOI":"10.21437\/Interspeech.2019-1938"},{"issue":"3","key":"8_CR24","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1109\/TASSP.1987.1165125","volume":"35","author":"S Katz","year":"1987","unstructured":"Katz, S.: Estimation of probabilities from sparse data for the language model component of a speech recognizer. IEEE Trans. Acoust. Speech Signal Process. 35(3), 400\u2013401 (1987)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"issue":"10","key":"8_CR25","first-page":"1995","volume":"3361","author":"Y LeCun","year":"1995","unstructured":"LeCun, Y., Bengio, Y., et al.: Convolutional networks for images, speech, and time series. Handb. Brain Theory Neural Netw. 3361(10), 1995 (1995)","journal-title":"Handb. Brain Theory Neural Netw."},{"issue":"11","key":"8_CR26","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998)","journal-title":"Proc. IEEE"},{"key":"8_CR27","doi-asserted-by":"crossref","unstructured":"Lee, J., Watanabe, S.: Intermediate loss regularization for CTC-based speech recognition. In: 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), ICASSP 2021, pp. 6224\u20136228. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414594"},{"key":"8_CR28","doi-asserted-by":"crossref","unstructured":"Likhomanenko, T., Synnaeve, G., Collobert, R.: Who needs words? Lexicon-free speech recognition. In: Proceedings of the Interspeech 2019, pp. 3915\u20133919 (2019)","DOI":"10.21437\/Interspeech.2019-3107"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Liu, A.T., Yang, S., Chi, P.H., Hsu, P., Lee, H.: Mockingjay: unsupervised speech representation learning with deep bidirectional transformer encoders. In: 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), ICASSP 2020, pp. 6419\u20136423. 
IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9054458"},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11976\u201311986 (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"8_CR31","unstructured":"Lowerre, B.T.: The Harpy Speech Recognition System. Carnegie Mellon University (1976)"},{"key":"8_CR32","doi-asserted-by":"crossref","unstructured":"Mikolov, T., Karafi\u00e1t, M., Burget, L., Cernocky, J., Khudanpur, S.: Recurrent neural network based language model. In: Interspeech, vol. 2, pp. 1045\u20131048. Makuhari (2010)","DOI":"10.21437\/Interspeech.2010-343"},{"key":"8_CR33","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: Advances in Neural Information Processing Systems, vol. 26 (2013)"},{"key":"8_CR34","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an ASR corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"8_CR35","doi-asserted-by":"crossref","unstructured":"Paul, D.B., Baker, J.: The design for the wall street journal-based CSR corpus. In: Speech and Natural Language: Proceedings of a Workshop Held at Harriman, New York, 23\u201326 February 1992 (1992)","DOI":"10.3115\/1075527.1075614"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"Pratap, V., et al.: Wav2letter++: a fast open-source speech recognition system. In: 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), ICASSP 2019, pp. 6460\u20136464. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8683535"},{"key":"8_CR37","unstructured":"Rabiner, L.R.: Applications of speech recognition in the area of telecommunications. In: 1997 IEEE Workshop on Automatic Speech Recognition and Understanding Proceedings, pp. 501\u2013510. IEEE (1997)"},{"key":"8_CR38","unstructured":"Raghu, M., Unterthiner, T., Kornblith, S., Zhang, C., Dosovitskiy, A.: Do vision transformers see like convolutional neural networks? In: Advances in Neural Information Processing Systems, vol. 34 (2021)"},{"key":"8_CR39","doi-asserted-by":"crossref","unstructured":"Rao, K., Peng, F., Sak, H., Beaufays, F.: Grapheme-to-phoneme conversion using long short-term memory recurrent neural networks. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4225\u20134229. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178767"},{"key":"8_CR40","unstructured":"Ringger, E.: A robust loose coupling for speech recognition and natural language understanding. Ph.D. thesis, The University of Rochester (1995)"},{"key":"8_CR41","unstructured":"Saito, S., Itakura, F.: The theoretical consideration of statistically optimum methods for speech spectral density. Electrical Communication Laboratory, NTT, Tokyo, Rep 3107 (1966)"},{"key":"8_CR42","unstructured":"Trockman, A., Kolter, J.Z.: Patches are all you need? arXiv preprint arXiv:2201.09792 (2022)"},{"key":"8_CR43","unstructured":"Van Den Oord, A., Vinyals, O., et al.: Neural discrete representation learning. In: Advances in Neural Information Processing Systems, vol. 
30 (2017)"},{"key":"8_CR44","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, pp. 6000\u20136010 (2017)"},{"key":"8_CR45","doi-asserted-by":"crossref","unstructured":"Vintsyuk, T.K.: Speech discrimination by dynamic programming. Cybernetics 4(1), 52\u201357 (1968). Russian Kibernetika 4(1):81\u201388 (1968)","DOI":"10.1007\/BF01074755"},{"key":"8_CR46","doi-asserted-by":"crossref","unstructured":"Watanabe, S., et al.: ESPNet: end-to-end speech processing toolkit. In: Proceedings of the Interspeech 2018, pp. 2207\u20132211 (2018)","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"8_CR47","unstructured":"Wu, Z., Liu, Z., Lin, J., Lin, Y., Han, S.: Lite transformer with long-short range attention. In: International Conference on Learning Representations (2019)"},{"key":"8_CR48","doi-asserted-by":"publisher","unstructured":"Xiong, W., et al.: Achieving human parity in conversational speech recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. (2016). https:\/\/doi.org\/10.1109\/TASLP.2017.2756440","DOI":"10.1109\/TASLP.2017.2756440"},{"key":"8_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Chan, W., Jaitly, N.: Very deep convolutional networks for end-to-end speech recognition. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4845\u20134849. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7953077"}],"container-title":["Lecture Notes in Computer Science","Human-Centered Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-24349-3_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,4,3]],"date-time":"2023-04-03T10:21:18Z","timestamp":1680517278000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-24349-3_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031243486","9783031243493"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-24349-3_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"4 April 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ECCAI Advanced Course on Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Berlin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 
2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"acai-agents2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.humane-ai.eu\/event\/acai2021\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"21","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"21","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"100% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}