{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,22]],"date-time":"2025-03-22T12:12:19Z","timestamp":1742645539585},"reference-count":29,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"4","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2023,4,1]]},"DOI":"10.1587\/transinf.2022edp7151","type":"journal-article","created":{"date-parts":[[2023,3,31]],"date-time":"2023-03-31T22:27:46Z","timestamp":1680301666000},"page":"538-544","source":"Crossref","is-referenced-by-count":5,"title":["Speech Recognition for Air Traffic Control via Feature Learning and End-to-End Training"],"prefix":"10.1587","volume":"E106.D","author":[{"given":"Peng","family":"FAN","sequence":"first","affiliation":[{"name":"National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiyao","family":"HUA","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi","family":"LIN","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"YANG","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianwei","family":"ZHANG","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenyi","family":"GE","sequence":"additional","affiliation":[{"name":"College of Computer Science, Chengdu University of Information Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongyue","family":"GUO","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"publisher","unstructured":"[1] M. El Ayadi, M.S. Kamel, and F. Karray, \u201cSurvey on speech emotion recognition: Features, classification schemes, and databases,\u201d Pattern Recognition, vol.44, no.3, pp.572-587, 2011. 10.1016\/j.patcog.2010.09.020","DOI":"10.1016\/j.patcog.2010.09.020"},{"key":"2","unstructured":"[2] C.M. Geac\u0103r, \u201cReducing pilot\/ATC communication errors using voice recognition,\u201d Proc. ICAS, 2010."},{"key":"3","doi-asserted-by":"publisher","unstructured":"[3] Y. Lin, D. Guo, J. Zhang, Z. Chen, and B. Yang, \u201cA unified framework for multilingual speech recognition in air traffic control systems,\u201d IEEE Trans. Neural Netw. Learn. Syst., vol.32, no.8, pp.3608-3620, 2021. 10.1109\/TNNLS.2020.3015830","DOI":"10.1109\/TNNLS.2020.3015830"},{"key":"4","doi-asserted-by":"publisher","unstructured":"[4] Y. Lin, L. Deng, Z. Chen, X. Wu, J. Zhang, and B. Yang, \u201cA real-time ATC safety monitoring framework using a deep learning approach,\u201d IEEE Trans. Intell. Transp. Syst., vol.21, no.11, pp.4572-4581, 2020. 10.1109\/tits.2019.2940992","DOI":"10.1109\/TITS.2019.2940992"},{"key":"5","unstructured":"[5] M. Ravanelli and Y. Bengio, \u201cInterpretable convolutional filters with SincNet,\u201d arXiv preprint arXiv:1811.09725, 2018. 10.48550\/arXiv.1811.09725"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] S. Kim, T. Hori, and S. Watanabe, \u201cJoint CTC-attention based end-to-end speech recognition using multi-task learning,\u201d 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.4835-4839, IEEE, 2017. 10.1109\/icassp.2017.7953075","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"7","doi-asserted-by":"publisher","unstructured":"[7] Y. Lin, \u201cSpoken instruction understanding in air traffic control: Challenge, technique, and application,\u201d Aerospace, vol.8, no.3, 65, 2021. 10.3390\/aerospace8030065","DOI":"10.3390\/aerospace8030065"},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] S. Schneider, A. Baevski, R. Collobert, and M. Auli, \u201cwav2vec: Unsupervised pre-training for speech recognition,\u201d arXiv preprint arXiv:1904.05862, 2019. 10.48550\/arXiv.1904.05862","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] T.N. Sainath, O. Vinyals, A. Senior, and H. Sak, \u201cConvolutional, long short-term memory, fully connected deep neural networks,\u201d 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.4580-4584, IEEE, 2015. 10.1109\/icassp.2015.7178838","DOI":"10.1109\/ICASSP.2015.7178838"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] H. Soltau, H. Liao, and H. Sak, \u201cNeural speech recognizer: Acoustic-to-word LSTM model for large vocabulary speech recognition,\u201d arXiv preprint arXiv:1610.09975, 2016. 10.48550\/arXiv.1610.09975","DOI":"10.21437\/Interspeech.2017-1566"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] M. Ravanelli and Y. Bengio, \u201cSpeech and speaker recognition from raw waveform with SincNet,\u201d arXiv preprint arXiv:1812.05920, 2018. 10.48550\/arXiv.1812.05920","DOI":"10.1109\/SLT.2018.8639585"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] T. Parcollet, M. Morchid, and G. Linar\u00e8s, \u201cE2E-SincNet: Toward fully end-to-end speech recognition,\u201d ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.7714-7718, IEEE, 2020. 10.1109\/icassp40776.2020.9053954","DOI":"10.1109\/ICASSP40776.2020.9053954"},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] L. K\u00fcrzinger, N. Lindae, P. Klewitz, and G. Rigoll, \u201cLightweight end-to-end speech recognition from raw audio data using sinc-convolutions,\u201d arXiv preprint arXiv:2010.07597, 2020. 10.48550\/arXiv.2010.07597","DOI":"10.21437\/Interspeech.2020-1392"},{"key":"14","doi-asserted-by":"publisher","unstructured":"[14] C. Yi, S. Zhou, and B. Xu, \u201cEfficiently fusing pretrained acoustic and linguistic encoders for low-resource speech recognition,\u201d IEEE Signal Process. Lett., vol.28, pp.788-792, 2021. 10.1109\/lsp.2021.3071668","DOI":"10.1109\/LSP.2021.3071668"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] S. Yadav and N. Zeghidour, \u201cLearning neural audio features without supervision,\u201d Proc. Interspeech 2022, pp.396-400, 2022. 10.21437\/interspeech.2022-10834","DOI":"10.21437\/Interspeech.2022-10834"},{"key":"16","unstructured":"[16] N. Zeghidour, O. Teboul, F. de Chaumont Quitry, and M. Tagliasacchi, \u201cLEAF: A learnable frontend for audio classification,\u201d arXiv preprint arXiv:2101.08596, 2021. 10.48550\/arXiv.2101.08596"},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] Z. Yue, E. Loweimi, H. Christensen, J. Barker, and Z. Cvetkovic, \u201cDysarthric speech recognition from raw waveform with parametric CNNs,\u201d Proc. Interspeech 2022, pp.31-35, 2022. 10.21437\/interspeech.2022-163","DOI":"10.21437\/Interspeech.2022-163"},{"key":"18","doi-asserted-by":"crossref","unstructured":"[18] Z. Ma, Y. Qiu, F. Hou, R. Wang, J.T.W. Chu, and C. Bullen, \u201cDetermining the best acoustic features for smoker identification,\u201d ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.8177-8181, 2022. 10.1109\/icassp43922.2022.9747712","DOI":"10.1109\/ICASSP43922.2022.9747712"},{"key":"19","unstructured":"[19] Z.G. Juan, P. Motlicek, Q. Zhan, R. Braun, and K. Vesely, \u201cAutomatic speech recognition benchmark for air-traffic communications,\u201d Tech. Rep., ISCA, 2020."},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] J. Zuluaga-Gomez, I. Nigmatulina, A. Prasad, P. Motlicek, K. Vesel\u1ef3, M. Kocour, and I. Sz\u00f6ke, \u201cContextual semi-supervised learning: An approach to leverage air-surveillance and untranscribed ATC data in ASR systems,\u201d arXiv preprint arXiv:2104.03643, 2021. 10.48550\/arXiv.2104.03643","DOI":"10.21437\/Interspeech.2021-1373"},{"key":"21","doi-asserted-by":"publisher","unstructured":"[21] Y. Lin, Q. Li, B. Yang, Z. Yan, H. Tan, and Z. Chen, \u201cImproving speech recognition models with small samples for air traffic control systems,\u201d Neurocomputing, vol.445, pp.287-297, 2021. 10.1016\/j.neucom.2020.08.092","DOI":"10.1016\/j.neucom.2020.08.092"},{"key":"22","doi-asserted-by":"publisher","unstructured":"[22] D. Guo, Z. Zhang, P. Fan, J. Zhang, and B. Yang, \u201cA context-aware language model to improve the speech recognition in air traffic control,\u201d Aerospace, vol.8, no.11, 348, 2021. 10.3390\/aerospace8110348","DOI":"10.3390\/aerospace8110348"},{"key":"23","doi-asserted-by":"publisher","unstructured":"[23] D. Guo, J. Zhang, B. Yang, and Y. Lin, \u201cA comparative study of speaker role identification in air traffic communication using deep learning approaches,\u201d ACM Trans. Asian Low-Resour. Lang. Inf. Process., accepted. 10.1145\/3572792","DOI":"10.1145\/3572792"},{"key":"24","unstructured":"[24] D. Amodei, S. Ananthanarayanan, R. Anubhai, J. Bai, E. Battenberg, C. Case, J. Casper, B. Catanzaro, Q. Cheng, G. Chen, et al., \u201cDeep speech 2: End-to-end speech recognition in English and Mandarin,\u201d International Conference on Machine Learning, pp.173-182, PMLR, 2016."},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] A. Graves, S. Fern\u00e1ndez, F. Gomez, and J. Schmidhuber, \u201cConnectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks,\u201d Proc. 23rd International Conference on Machine Learning, pp.369-376, 2006. 10.1145\/1143844.1143891","DOI":"10.1145\/1143844.1143891"},{"key":"26","doi-asserted-by":"crossref","unstructured":"[26] B. Yang, X. Tan, Z. Chen, B. Wang, D. Li, Z. Yang, X. Wu, and Y. Lin, \u201cATCspeech: A multilingual pilot-controller speech corpus from real air traffic control environment,\u201d arXiv preprint arXiv:1911.11365, 2019. 10.48550\/arXiv.1911.11365","DOI":"10.21437\/Interspeech.2020-1020"},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] J. Li, V. Lavrukhin, B. Ginsburg, R. Leary, O. Kuchaiev, J.M. Cohen, H. Nguyen, and R.T. Gadde, \u201cJasper: An end-to-end convolutional neural acoustic model,\u201d arXiv preprint arXiv:1904.03288, 2019. 10.48550\/arXiv.1904.03288","DOI":"10.21437\/Interspeech.2019-1819"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] V. Pratap, A. Hannun, Q. Xu, J. Cai, J. Kahn, G. Synnaeve, V. Liptchinsky, and R. Collobert, \u201cWav2letter++: A fast open-source speech recognition system,\u201d ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.6460-6464, IEEE, 2019. 10.1109\/icassp.2019.8683535","DOI":"10.1109\/ICASSP.2019.8683535"},{"key":"29","doi-asserted-by":"crossref","unstructured":"[29] A. Gulati, J. Qin, C.-C. Chiu, N. Parmar, Y. Zhang, J. Yu, W. Han, S. Wang, Z. Zhang, Y. Wu, and R. Pang, \u201cConformer: Convolution-augmented transformer for speech recognition,\u201d arXiv preprint arXiv:2005.08100, 2020. 10.48550\/arXiv.2005.08100","DOI":"10.21437\/Interspeech.2020-3015"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E106.D\/4\/E106.D_2022EDP7151\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,4,1]],"date-time":"2023-04-01T04:31:34Z","timestamp":1680323494000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E106.D\/4\/E106.D_2022EDP7151\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,1]]},"references-count":29,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2022edp7151","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,4,1]]},"article-number":"2022EDP7151"}}