{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T17:27:57Z","timestamp":1767374877355,"version":"3.37.3"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"28","license":[{"start":{"date-parts":[[2022,5,20]],"date-time":"2022-05-20T00:00:00Z","timestamp":1653004800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"},{"start":{"date-parts":[[2022,5,20]],"date-time":"2022-05-20T00:00:00Z","timestamp":1653004800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100008205","name":"Auckland University of Technology","doi-asserted-by":"crossref","id":[{"id":"10.13039\/100008205","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2022,11]]},"DOI":"10.1007\/s11042-022-12136-3","type":"journal-article","created":{"date-parts":[[2022,5,24]],"date-time":"2022-05-24T10:02:54Z","timestamp":1653386574000},"page":"41295-41308","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["A hybrid CTC+Attention model based on end-to-end framework for multilingual speech recognition"],"prefix":"10.1007","volume":"81","author":[{"given":"Sendong","family":"Liang","sequence":"first","affiliation":[]},{"given":"Wei Qi","family":"Yan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,5,20]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Abdel-Hamid O, Mohamed AR, Jiang H, Penn G (2012) Applying convolutional neural networks concepts to hybrid NN-HMM model for speech recognition. In: IEEE International conference on acoustics, speech and signal processing (ICASSP). IEEE, pp 4277\u20134280","key":"12136_CR1","DOI":"10.1109\/ICASSP.2012.6288864"},{"unstructured":"Boden M (2002) A guide to recurrent neural networks and backpropagation. The Dallas project: SICS technical report","key":"12136_CR2"},{"doi-asserted-by":"crossref","unstructured":"Chan JY, Ching P, Lee T, Meng HM (2004) Detection of language boundary in code-switching utterances by bi-phone probabilities. In: International symposium on chinese spoken language processing. IEEE, pp. 293\u2013296","key":"12136_CR3","DOI":"10.1109\/CHINSL.2004.1409644"},{"unstructured":"Chan W, Jaitly N, Le QV, Vinyals O (2015) Listen, attend and spell. arXiv:http:\/\/arxiv.org\/abs\/1508.01211","key":"12136_CR4"},{"unstructured":"Chiu CC, Raffel C (2017) Monotonic chunkwise attention. arXiv:http:\/\/arxiv.org\/abs\/1712.05382","key":"12136_CR5"},{"doi-asserted-by":"crossref","unstructured":"Eyben F, W\u00f6llmer M, Schuller B, Graves A (2009) From speech to letters-using a novel neural network architecture for grapheme based ASR. In: IEEE Workshop on automatic speech recognition & understanding. IEEE, pp 376\u2013380","key":"12136_CR6","DOI":"10.1109\/ASRU.2009.5373257"},{"unstructured":"Fu L, Li X, Zi L (2020)","key":"12136_CR7"},{"doi-asserted-by":"crossref","unstructured":"Georgescu AL, Cucu H, Burileanu C (2019) Kaldi-based DNN architectures for speech recognition in Romanian. In: International conference on speech technology and human-computer dialogue (sped). IEEE, pp 1\u20136","key":"12136_CR8","DOI":"10.1109\/SPED.2019.8906555"},{"issue":"7","key":"12136_CR9","doi-asserted-by":"publisher","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"GE Hinton","year":"2006","unstructured":"Hinton GE, Osindero S, Teh YW (2006) A fast learning algorithm for deep belief nets. Neural Comput 18(7):1527\u20131554","journal-title":"Neural Comput"},{"unstructured":"iFLYTEK Co., Ltd: Online TTS WebAPI. Website (2020). https:\/\/global.xfyun.cn\/products\/online_tts","key":"12136_CR10"},{"key":"12136_CR11","first-page":"68","volume":"67","author":"CA Jason","year":"2020","unstructured":"Jason CA, Kumar S (2020) An appraisal on speech and emotion recognition technologies based on machine learning. Language 67:68","journal-title":"Language"},{"doi-asserted-by":"crossref","unstructured":"Kim S, Hori T, Watanabe S (2017) Joint CTC-attention based end-to-end speech recognition using multi-task learning. In: IEEE International conference on acoustics, speech and signal processing. IEEE, pp 4835\u20134839","key":"12136_CR12","DOI":"10.1109\/ICASSP.2017.7953075"},{"doi-asserted-by":"crossref","unstructured":"Li B, Chang Sy , Sainath TN, Pang R, He Y , Strohman T, Wu Y (2020) Towards fast and accurate streaming end-to-end ASR. In: IEEE International conference on acoustics, speech and signal processing (ICASSP). IEEE, pp 6069\u20136073","key":"12136_CR13","DOI":"10.1109\/ICASSP40776.2020.9054715"},{"doi-asserted-by":"crossref","unstructured":"Li J, Zhao R, Hu H, Gong Y (2019) Improving RNN transducer modeling for end-to-end speech recognition. In: IEEE Automatic speech recognition and understanding workshop. IEEE, pp 114\u2013121","key":"12136_CR14","DOI":"10.1109\/ASRU46091.2019.9003906"},{"key":"12136_CR15","volume-title":"Multilingual speech recognition based on the end-to-end framework (Master\u2019s Thesis)","author":"S Liang","year":"2021","unstructured":"Liang S (2021) Multilingual speech recognition based on the end-to-end framework (Master\u2019s Thesis). Auckland University of Technology, New Zealand"},{"doi-asserted-by":"crossref","unstructured":"Lin CH, Lee LS, Ting PY (1993) A new framework for recognition of Mandarin syllables with tones using sub-syllabic units. In: IEEE International conference on acoustics, speech, and signal processing, vol. 2. IEEE, pp 227\u2013230","key":"12136_CR16","DOI":"10.1109\/ICASSP.1993.319276"},{"doi-asserted-by":"crossref","unstructured":"Liu Z, Chen Q, Hu H, Tang H, Zou Y (2019) Teacher-student learning and post-processing for robust biLSTM mask-based acoustic beamforming. In: International conference on neural information processing. Springer, pp. 522\u2013533","key":"12136_CR17","DOI":"10.1007\/978-3-030-36718-3_44"},{"key":"12136_CR18","doi-asserted-by":"publisher","first-page":"195","DOI":"10.1016\/j.csl.2016.06.007","volume":"41","author":"AL Maas","year":"2017","unstructured":"Maas AL, Qi P, Xie Z, Hannun AY, Lengerich CT, Jurafsky D, Ng AY (2017) Building DNN acoustic models for large vocabulary speech recognition. Comput Speech Lang 41:195\u2013213","journal-title":"Comput Speech Lang"},{"doi-asserted-by":"crossref","unstructured":"Manaswi NK, Manaswi NK, John S (2018) Deep learning with applications using python. Springer","key":"12136_CR19","DOI":"10.1007\/978-1-4842-3516-4"},{"unstructured":"Mansikkaniemi A (2010) Acoustic model and language model adaptation for a mobile dictation service (master\u2019s thesis). Aalto university","key":"12136_CR20"},{"doi-asserted-by":"crossref","unstructured":"Miao Y, Gowayyed M, Metze F (2015) EESEN: End-to-end Speech recognition using deep RNN models and WFST-based decoding. In: IEEE Workshop on automatic speech recognition and understanding (ASRU). IEEE, pp 167\u2013174","key":"12136_CR21","DOI":"10.1109\/ASRU.2015.7404790"},{"doi-asserted-by":"crossref","unstructured":"Mikolov T, Karafi\u00e1t M, Burget L, \u010cernocky\u0300 J, Khudanpur S (2010) Recurrent neural network based language model. In: Annual conference of the International speech communication association","key":"12136_CR22","DOI":"10.21437\/Interspeech.2010-343"},{"unstructured":"Mohamed Ar, Dahl G, Hinton G (2009) Deep belief networks for phone recognition. In: NIPS Workshop on deep learning for speech recognition and related applications, vol. 1. Vancouver, Canada, p 39","key":"12136_CR23"},{"doi-asserted-by":"crossref","unstructured":"Moritz N, Hori T, Le Roux J (2019) Triggered attention for end-to-end speech recognition. In: IEEE International conference on acoustics, speech and signal processing. IEEE, pp 5666\u20135670","key":"12136_CR24","DOI":"10.1109\/ICASSP.2019.8683510"},{"doi-asserted-by":"crossref","unstructured":"Panayotov V, Chen G, Povey D, Khudanpur S (2015) Librispeech: an ASR corpus based on public domain audio books. In: IEEE International conference on acoustics, speech and signal processing. IEEE, pp 5206\u20135210","key":"12136_CR25","DOI":"10.1109\/ICASSP.2015.7178964"},{"issue":"1","key":"12136_CR26","doi-asserted-by":"publisher","first-page":"1261","DOI":"10.1515\/jisys-2018-0372","volume":"29","author":"V Passricha","year":"2020","unstructured":"Passricha V, Aggarwal RK (2020) A hybrid of deep cnn and bidirectional LSTM for automatic speech recognition. J Intell Syst 29(1):1261\u20131274","journal-title":"J Intell Syst"},{"doi-asserted-by":"crossref","unstructured":"Petridis S, Stafylakis T, Ma P, Tzimiropoulos G, Pantic M (2018) Audio-visual speech recognition with a hybrid CTC\/attention architecture. In: IEEE Spoken language technology workshop. IEEE, pp 513\u2013520","key":"12136_CR27","DOI":"10.1109\/SLT.2018.8639643"},{"unstructured":"Povey D, Ghoshal A, Boulianne G, Burget L, Glembek O, Goel N, Hannemann M, Motlicek P, Qian Y, Schwarz P et al (2011) The Kaldi speech recognition toolkit. In: IEEE Workshop on automatic speech recognition and understanding. IEEE signal processing society","key":"12136_CR28"},{"doi-asserted-by":"crossref","unstructured":"Qu Z, Haghani P, Weinstein E, Moreno P (2017) Syllable-based acoustic modeling with CTC-SMBR-LSTM. In: IEEE Automatic speech recognition and understanding workshop. IEEE, pp 173\u2013177","key":"12136_CR29","DOI":"10.1109\/ASRU.2017.8268932"},{"doi-asserted-by":"crossref","unstructured":"Senior A, Sak H, Shafran I (2015) Context dependent phone models for LSTM RNN acoustic modelling. In: IEEE International conference on acoustics, speech and signal processing. IEEE, pp 4585\u20134589","key":"12136_CR30","DOI":"10.1109\/ICASSP.2015.7178839"},{"doi-asserted-by":"crossref","unstructured":"Shi F, Cheng X, Chen X (2012) The summarize of improved HMM Model. In: International conference on computer and information application, pp. 627\u2013630","key":"12136_CR31","DOI":"10.2991\/iccia.2012.151"},{"key":"12136_CR32","doi-asserted-by":"publisher","first-page":"101158","DOI":"10.1016\/j.csl.2020.101158","volume":"66","author":"P Smit","year":"2021","unstructured":"Smit P, Virpioja S, Kurimo M (2021) Advances in subword-based HMM-DNN speech recognition across languages. Comput Speech Lang 66:101158","journal-title":"Comput Speech Lang"},{"doi-asserted-by":"crossref","unstructured":"Sundermeyer M, Oparin I, Gauvain JL, Freiberg B, Schl\u00fcter R, Ney H (2013) Comparison of feedforward and recurrent neural network language models. In: IEEE International conference on acoustics, speech and signal processing. IEEE, pp 8430\u20138434","key":"12136_CR33","DOI":"10.1109\/ICASSP.2013.6639310"},{"unstructured":"TAL Education Group: TAL CS Auto Speech Recognition Data set. Website (2019). https:\/\/ai.100tal.com\/dataset","key":"12136_CR34"},{"doi-asserted-by":"crossref","unstructured":"Ueno S, Inaguma H, Mimura M, Kawahara T (2018) Acoustic-to-word attention-based model complemented with character-level CTC-based model. In: IEEE International conference on acoustics, speech and signal processing. IEEE, pp 5804\u20135808","key":"12136_CR35","DOI":"10.1109\/ICASSP.2018.8462576"},{"unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2017) Attention is all you need. arXiv:http:\/\/arxiv.org\/abs\/1706.03762","key":"12136_CR36"},{"issue":"8","key":"12136_CR37","doi-asserted-by":"publisher","first-page":"1018","DOI":"10.3390\/sym11081018","volume":"11","author":"D Wang","year":"2019","unstructured":"Wang D, Wang X, Lv S (2019) An overview of end-to-end automatic speech recognition. Symmetry 11(8):1018","journal-title":"Symmetry"},{"doi-asserted-by":"crossref","unstructured":"Wang W, Wang G, Bhatnagar A, Zhou Y, Xiong C, Socher R (2020) An investigation of phone-based subword units for end-to-end speech recognition. arXiv:http:\/\/arxiv.org\/abs\/2004.04290","key":"12136_CR38","DOI":"10.21437\/Interspeech.2020-1873"},{"doi-asserted-by":"crossref","unstructured":"Watanabe S, Hori T, Karita S, Hayashi T, Nishitoba J, Unno Y, Soplin NEY, Heymann J, Wiesner M, Chen N et al (2018) ESPnet: End-to-end speech processing toolkit. arXiv:http:\/\/arxiv.org\/abs\/1804.00015","key":"12136_CR39","DOI":"10.21437\/Interspeech.2018-1456"},{"doi-asserted-by":"crossref","unstructured":"Woodland PC, Odell JJ, Valtchev V, Young SJ (1994) Large vocabulary continuous speech recognition using HTK. In: IEEE International conference on acoustics, speech and signal processing, vol. 2. IEEE, pp II\u2013125","key":"12136_CR40","DOI":"10.1109\/ICASSP.1994.389562"},{"issue":"4","key":"12136_CR41","doi-asserted-by":"publisher","first-page":"858","DOI":"10.1109\/TASLP.2014.2310353","volume":"22","author":"CH Wu","year":"2014","unstructured":"Wu CH, Shen HP, Yang YT (2014) Chinese-english phone set construction for code-switching ASR using acoustic and DNN-extracted articulatory features. IEEE\/ACM Transactions on Audio, Speech, and Language Processing 22(4):858\u2013862","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"doi-asserted-by":"crossref","unstructured":"Yan WQ (2021) Computational methods for deep learning. Springer","key":"12136_CR42","DOI":"10.1007\/978-3-030-61081-4"},{"doi-asserted-by":"crossref","unstructured":"Zenkel T, Sanabria R, Metze F, Waibel A (2017) Subword and crossword units for CTC acoustic models. arXiv:http:\/\/arxiv.org\/abs\/1712.06855","key":"12136_CR43","DOI":"10.21437\/Interspeech.2018-2057"},{"unstructured":"Zheng Y, Yang X, Dang X (2020) Homophone-based label smoothing in end-to-end automatic speech recognition. arXiv:http:\/\/arxiv.org\/abs\/2004.03437","key":"12136_CR44"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-12136-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-022-12136-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-12136-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,25]],"date-time":"2022-10-25T10:04:18Z","timestamp":1666692258000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-022-12136-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,20]]},"references-count":44,"journal-issue":{"issue":"28","published-print":{"date-parts":[[2022,11]]}},"alternative-id":["12136"],"URL":"https:\/\/doi.org\/10.1007\/s11042-022-12136-3","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2022,5,20]]},"assertion":[{"value":"16 May 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 August 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 January 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 May 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}