{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T19:15:01Z","timestamp":1774034101780,"version":"3.50.1"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T00:00:00Z","timestamp":1687737600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T00:00:00Z","timestamp":1687737600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,7]]},"DOI":"10.1007\/s10772-023-10034-z","type":"journal-article","created":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T22:01:36Z","timestamp":1687816896000},"page":"531-539","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["A transformer-based network for speech recognition"],"prefix":"10.1007","volume":"26","author":[{"given":"Lina","family":"Tang","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,6,26]]},"reference":[{"key":"10034_CR1","unstructured":"Al-Taai, R. Y. L., Xiaojun, W., & Zhu, Y. (2020). Targeted voice enhancement by bandpass filter and composite deep denoising autoencoder. In: 14th international conference on signal processing and communication systems (ICSPCS) (pp. 1\u20136). SA, Australia: Adelaide."},{"key":"10034_CR2","unstructured":"Ba, J. L., Kiros, J. R., & Hinton, G. E. (2016). Layer normalization. arXiv:1607.06450"},{"key":"10034_CR3","doi-asserted-by":"crossref","unstructured":"Bai, Y., Yi, J., Tao, J., et\u00a0al. (2020). Listen attentively, and spell once: Whole sentence generation via a non-autoregressive architecture for low-latency speech recognition. arXiv:abs\/2005.04862v4","DOI":"10.21437\/Interspeech.2020-1600"},{"key":"10034_CR4","doi-asserted-by":"crossref","unstructured":"Bao, F., Gao, G., Yan, X., et al. (2013). Segmentation-based mongolian LVCSR approach. In: 2013 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 8136\u20138139), Vancouver, BC, Canada.","DOI":"10.1109\/ICASSP.2013.6639250"},{"key":"10034_CR5","doi-asserted-by":"crossref","unstructured":"Bu, H., Du, J., Na, X., et al. (2017). Aishell-1: An open-source mandarin speech corpus and a speech recognition baseline. In: 20th Conference of the oriental chapter of international committee for coordination and standardization of speech databases and assessment techniques (pp. 1\u20135), Seoul, South Korea.","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"10034_CR6","doi-asserted-by":"crossref","unstructured":"Bustamin, A., Indrabayu, Areni I. S., et al. (2016). Speech to text for Indonesian homophone phrase with MEL frequency cepstral coefficient. In: 2016 International conference on computational intelligence and cybernetics (pp 29\u201331), Makassar, Indonesia.","DOI":"10.1109\/CyberneticsCom.2016.7892562"},{"key":"10034_CR7","doi-asserted-by":"crossref","unstructured":"Cao, H., Ching, P. C., Lee, T., et al. (2010). Semantics-based language modeling for Cantonese\u2013English code-mixing speech recognition. In: 2010 7th international symposium on Chinese spoken language processing (pp. 246\u2013250), Tainan, Taiwan.","DOI":"10.1109\/ISCSLP.2010.5684900"},{"key":"10034_CR8","doi-asserted-by":"crossref","unstructured":"Cengiz, Y., & Ar\u0131\u00f6z, Y. (2016). An application for speech denoising using discrete wavelet transform. In: 20th national biomedical engineering meeting (BIYOMUT) (pp. 1\u20134), Izmir, Turkey.","DOI":"10.1109\/BIYOMUT.2016.7849377"},{"key":"10034_CR9","unstructured":"Chen, N., Watanabe, S., Villalba, J., et\u00a0al. (2019). Non-autoregressive transformer automatic speech recognition. arXiv:abs\/1911.04908v1."},{"key":"10034_CR10","doi-asserted-by":"crossref","unstructured":"Dong, L., Xu, S., & Xu, B. (2018). Speech-transformer: A no-recurrence sequence-to-sequence model for speech recognition. In: 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5884\u20135888). Calgary, AB, Canada.","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"10034_CR11","doi-asserted-by":"crossref","unstructured":"Dong, L., Wang, F., & Xu, B. (2019). Self-attention aligner: A latency-control end-to-end model for asr using self-attention network and chunk-hopping. In: 2019 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5656\u20135660). Brighton, UK.","DOI":"10.1109\/ICASSP.2019.8682954"},{"key":"10034_CR12","doi-asserted-by":"publisher","first-page":"198","DOI":"10.1109\/TASLP.2020.3039600","volume":"29","author":"C Fan","year":"2021","unstructured":"Fan, C., Yi, J., Tao, J., et al. (2021). Gated recurrent fusion with joint training framework for robust end-to-end speech recognition. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 29, 198\u2013209.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10034_CR13","doi-asserted-by":"crossref","unstructured":"Fan, R., Chu, W., Chang, P., et al. (2021). Cass-nat: Ctc alignment-based single step non-autoregressive transformer for speech recognition. In: 2021 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5889\u20135893), Toronto, ON, Canada.","DOI":"10.1109\/ICASSP39728.2021.9413429"},{"key":"10034_CR14","doi-asserted-by":"crossref","unstructured":"Feng, X., Zhang, Y., & Glass, J. (2014). Speech feature denoising and dereverberation via deep autoencoders for noisy reverberant speech recognition. In: 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 1759\u20131763). Florence, Italy.","DOI":"10.1109\/ICASSP.2014.6853900"},{"key":"10034_CR15","doi-asserted-by":"crossref","unstructured":"Fujita, Y., O, M., C, X., & Watanabe, S. (2020). Insertion-based modeling for end-to-end automatic speech recognition. In: Interspeech, (pp. 3660\u20133664).","DOI":"10.21437\/Interspeech.2020-1619"},{"key":"10034_CR16","doi-asserted-by":"crossref","unstructured":"Ghosh, P., Chingtham, T. S., & Ghose, M. K. (2016). SLHAR: A supervised learning approach for homophone ambiguity reduction from speech recognition system. In: 2016 Second international conference on research in computational intelligence and communication networks (ICRCICN) (pp. 12\u201316). Kolkata, India.","DOI":"10.1109\/ICRCICN.2016.7813543"},{"key":"10034_CR17","doi-asserted-by":"crossref","unstructured":"Ghosh, P., Chinghtham, T. S., & Ghose, M. K. (2019). Homophone ambiguity reduction from word level speech recognition using artificial immune system. In: 4th international conference on recent trends on electronics (pp. 161\u2013166). Communication Technology (RTEICT), Bangalore, India.","DOI":"10.1109\/RTEICT46194.2019.9016769"},{"key":"10034_CR18","unstructured":"Graves, A., & Jaitly, N. (2014). Towards end-to-end speech recognition with recurrent neural networks. In: International conference on machine learning (pp. 1764\u20131772), Beijing, China."},{"key":"10034_CR19","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F. J., et al. (2006). Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on machine learning (pp 369\u2013376), Pittsburgh, PA, USA.","DOI":"10.1145\/1143844.1143891"},{"key":"10034_CR20","doi-asserted-by":"crossref","unstructured":"Graves, A., rahman Mohamed, A., & Hinton, G. (2013). Speech recognition with deep recurrent neural networks. In: 2013 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 6645\u20136649), Vancouver BC, Canada.","DOI":"10.1109\/ICASSP.2013.6638947"},{"issue":"10","key":"10034_CR21","doi-asserted-by":"publisher","first-page":"2222","DOI":"10.1109\/TNNLS.2016.2582924","volume":"28","author":"K Greff","year":"2017","unstructured":"Greff, K., Srivastava, R. K., Koutn\u00edk, J., et al. (2017). LSTM: A search space odyssey. IEEE Transactions on Neural Networks and Learning Systems, 28(10), 2222\u20132232.","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"issue":"12","key":"10034_CR22","doi-asserted-by":"publisher","first-page":"2313","DOI":"10.1109\/TASLP.2017.2738559","volume":"25","author":"\u0110T Grozdi\u0107","year":"2017","unstructured":"Grozdi\u0107, \u0110T., & Jovi\u010di\u0107, S. T. (2017). Whispered speech recognition using deep denoising autoencoder and inverse filtering. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 25(12), 2313\u20132322.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10034_CR23","doi-asserted-by":"crossref","unstructured":"Gulati, A., Qin, J., Chiu, C. C., et al. (2020). Conformer: Convolution augmented transformer for speech recognition. In: Proceedings of the annual conference of the international speech communication association (pp 5036\u20135040), Shanghai, China.","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"10034_CR24","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et al. (2016). Deep residual learning for image recognition. In: 2016 IEEE conference on computer vision and pattern recognition (pp. 770\u2013778), Las Vegas, NV, USA.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10034_CR25","doi-asserted-by":"crossref","unstructured":"He, R., Ravula, A., Kanagal, B., et\u00a0al. (2020) Real-former: Transformer likes residual attention. arXiv:abs\/2012.11747v2","DOI":"10.18653\/v1\/2021.findings-acl.81"},{"issue":"6","key":"10034_CR26","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton, G., Deng, L., Yu, D., et al. (2012). Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups. IEEE Signal Processing Magazine, 29(6), 82\u201397.","journal-title":"IEEE Signal Processing Magazine"},{"key":"10034_CR27","doi-asserted-by":"crossref","unstructured":"Hu, Y., Hou, N., Chen, C., et al. (2022). Interactive feature fusion for end-to-end noise-robust speech recognition. In: 2022 IEEE international conference on acoustics speech and signal processing (ICASSP) (pp. 6292\u20136296). Singapore.","DOI":"10.1109\/ICASSP43922.2022.9746314"},{"issue":"2","key":"10034_CR28","doi-asserted-by":"publisher","first-page":"224","DOI":"10.1109\/LSP.2017.2782270","volume":"25","author":"K Isogawa","year":"2018","unstructured":"Isogawa, K., Ida, T., Shiodera, T., et al. (2018). Deep shrinkage convolutional neural network for adaptive noise reduction. IEEE Signal Processing Letters, 25(2), 224\u2013228.","journal-title":"IEEE Signal Processing Letters"},{"key":"10034_CR29","doi-asserted-by":"crossref","unstructured":"Karita, S., Chen, N., & Hayashi, T. (2019). A comparative study on transformer vs RNN in speech applications. In: 2019 IEEE automatic speech recognition and understanding workshop (ASRU) (pp 449\u2013456), Singapore.","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"10034_CR30","unstructured":"Kitaev, N., Kaiser, L., & Levskaya, A. (2020). Reformer: The efficient transformer. In: International conference on learning representations (ICLR). Virtual Conference, Addis Ababa, Ethiopia."},{"key":"10034_CR31","doi-asserted-by":"crossref","unstructured":"Kim, S., Hori, T., & Watanabe, S. (2017). Joint CTC-attention based end-to-end speech recognition using multi-task learning. In: 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 3839\u20134835). New Orleans, LA, USA.","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"10034_CR32","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1007\/s10772-021-09955-4","volume":"25","author":"M Liu","year":"2022","unstructured":"Liu, M. (2022). English speech emotion recognition method based on speech recognition. International Journal of Speech Technology, 25, 391\u2013398.","journal-title":"International Journal of Speech Technology"},{"key":"10034_CR33","doi-asserted-by":"crossref","unstructured":"Liu, Y., Fung, P., Yang, Y., et al. (2006). HKUST\/MTS: A very large scale mandarin telephone speech corpus. In: International symposium on Chinese spoken language processing (ISCSLP 2006) (pp 724\u2013735), Singapore.","DOI":"10.1007\/11939993_73"},{"key":"10034_CR34","doi-asserted-by":"crossref","unstructured":"Lv, X., Chen, S. B., & Wang, X. (2021). Adversarial training with gated convolutional neural networks for robust speech recognition. In: 2021 17th international conference on computational intelligence and security (CIS) (pp. 113\u2013117). Chengdu, China.","DOI":"10.1109\/CIS54983.2021.00032"},{"key":"10034_CR35","doi-asserted-by":"crossref","unstructured":"Miao, Y., Gowayyed, M., Na, X., et al. (2016). An empirical exploration of CTC acoustic models. In: 2016 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 2623\u20132627), Shanghai, China.","DOI":"10.1109\/ICASSP.2016.7472152"},{"key":"10034_CR36","doi-asserted-by":"crossref","unstructured":"\u00d6zkan, K., Seke, E., & I\u015f\u0131k, \u015e. (2016). A new approach for speech denoising. In: 24th signal processing and communication application conference (pp. 2109\u20132112), Zonguldak, Turkey.","DOI":"10.1109\/SIU.2016.7496188"},{"issue":"2","key":"10034_CR37","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1109\/5.18626","volume":"77","author":"LR Rabiner","year":"1989","unstructured":"Rabiner, L. R. (1989). A tutorial on hidden Markov models and selected applications in speech recognition. Proceedings of the IEEE, 77(2), 257\u2013286.","journal-title":"Proceedings of the IEEE"},{"key":"10034_CR38","doi-asserted-by":"crossref","unstructured":"Ramadan, R. A. (2021). RETRACTED ARTICLE: Detecting adversarial attacks on audio-visual speech recognition using deep learning method. International Journal of Speech Technology.","DOI":"10.1007\/s10772-021-09859-3"},{"key":"10034_CR39","doi-asserted-by":"crossref","unstructured":"Sak, H., Senior, A., & Beaufays, F. (2014). Long short-term memory recurrent neural network architectures for large scale acoustic modeling. In: Fifteenth annual conference of the international speech communication association (pp. 338\u2013342), Singapore.","DOI":"10.21437\/Interspeech.2014-80"},{"key":"10034_CR40","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al. (2017). Attention is all you need. In M. I. Jordan, Y. LeCun, & S. A. Solla (Eds.), Advances in neural information processing systems (pp. 5999\u20136009). MIT Press."},{"key":"10034_CR41","unstructured":"Wang, J., Wang, D., Chen, Y., et\u00a0al. (2019). Noise robustness automatic speech recognition with convolutional neural network and time delay neural network. Journal of the Audio Engineering Society."},{"key":"10034_CR42","doi-asserted-by":"crossref","unstructured":"Wilson, K. W., Raj, B., Smaragdis, P., et al. (2008). Speech denoising using nonnegative matrix factorization with priors. In: 2008 IEEE international conference on acoustics speech and signal processing (pp. 4029\u20134032). Las Vegas, NV, USA.","DOI":"10.1109\/ICASSP.2008.4518538"},{"key":"10034_CR43","doi-asserted-by":"crossref","unstructured":"Winata, G. I., Cahyawijaya, S., Lin, Z., et al. (2020). Lightweight and efficient end-to-end speech recognition using low-rank transformer. In: 2020 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp 6144\u20136148), Barcelona, Spain.","DOI":"10.1109\/ICASSP40776.2020.9053878"},{"key":"10034_CR44","doi-asserted-by":"publisher","first-page":"149","DOI":"10.1007\/s10772-020-09671-5","volume":"23","author":"GT Yadava","year":"2020","unstructured":"Yadava, G. T., & Jayanna, H. S. (2020). Enhancements in automatic Kannada speech recognition system by background noise elimination and alternate acoustic modelling. International Journal of Speech Technology, 23, 149\u2013167.","journal-title":"International Journal of Speech Technology"},{"key":"10034_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, H., Bao, F., Gao, G., et al. (2016). Comparison on neural network based acoustic model in Mongolian speech recognition. In: 2016 international conference on Asian language processing (IALP) (pp. 1\u20135), Tainan, Taiwan.","DOI":"10.1109\/IALP.2016.7875921"},{"issue":"7","key":"10034_CR46","doi-asserted-by":"publisher","first-page":"4681","DOI":"10.1109\/TII.2019.2943898","volume":"16","author":"M Zhao","year":"2020","unstructured":"Zhao, M., Zhong, S., Fu, X., et al. (2020). Deep residual shrinkage networks for fault diagnosis. IEEE Transactions on Industrial Informatics, 16(7), 4681\u20134690.","journal-title":"IEEE Transactions on Industrial Informatics"},{"key":"10034_CR47","doi-asserted-by":"crossref","unstructured":"Zhikui, D., Guozhi, G., & Jiawei, C. (2022). Dual-residual transformer network for speech recognition. Journal of the Audio Engineering Society 70(10), 871\u2013881.","DOI":"10.17743\/jaes.2022.0029"},{"key":"10034_CR48","doi-asserted-by":"publisher","first-page":"563","DOI":"10.1007\/s10772-018-9516-7","volume":"21","author":"X Zhong","year":"2018","unstructured":"Zhong, X., Dai, Y., Dai, Y., et al. (2018). Study on processing of wavelet speech denoising in speech recognition system. International Journal of Speech Technology, 21, 563\u2013569.","journal-title":"International Journal of Speech Technology"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10034-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-023-10034-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10034-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,31]],"date-time":"2023-07-31T11:17:29Z","timestamp":1690802249000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-023-10034-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,26]]},"references-count":48,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2023,7]]}},"alternative-id":["10034"],"URL":"https:\/\/doi.org\/10.1007\/s10772-023-10034-z","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"value":"1381-2416","type":"print"},{"value":"1572-8110","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,6,26]]},"assertion":[{"value":"4 August 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 May 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 June 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All the authors do not have any possible conflicts of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}