{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,18]],"date-time":"2026-05-18T14:16:26Z","timestamp":1779113786861,"version":"3.51.4"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T00:00:00Z","timestamp":1769644800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T00:00:00Z","timestamp":1769644800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-026-21303-9","type":"journal-article","created":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T14:56:56Z","timestamp":1769698616000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Noise-augmented transformer-based automatic speech recognizer using a novel noise distillation system"],"prefix":"10.1007","volume":"85","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4485-3393","authenticated-orcid":false,"given":"Bachchu","family":"Paul","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Santanu","family":"Phadikar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Utpal","family":"Nandi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,29]]},"reference":[{"key":"21303_CR1","doi-asserted-by":"crossref","unstructured":"Yan BC, Wang HW, Wang YC, Li JT, Lin CH, Chen B (2023) Preserving phonemic distinctions for ordinal regression: A novel loss function for automatic pronunciation assessment. In: 2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp 1\u20137. IEEE","DOI":"10.1109\/ASRU57964.2023.10389777"},{"issue":"12","key":"21303_CR2","doi-asserted-by":"publisher","first-page":"6875","DOI":"10.1007\/s00521-024-09435-1","volume":"36","author":"VN Vitale","year":"2024","unstructured":"Vitale VN, Cutugno F, Origlia A, Coro G (2024) Exploring emergent syllables in end-to-end automatic speech recognizers through model explainability technique. Neural Comput Appl 36(12):6875\u20136901","journal-title":"Neural Comput Appl"},{"key":"21303_CR3","first-page":"1","volume":"1","author":"S Qin","year":"2022","unstructured":"Qin S, Wang L, Li S, Dang J, Pan L (2022) Improving low-resource Tibetan end-to-end ASR by multilingual and multilevel unit modeling. EURASIP J Audio Speech Music Process 1:1\u201310","journal-title":"EURASIP J Audio Speech Music Process"},{"issue":"1","key":"21303_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-019-0235-y","volume":"6","author":"M Roccetti","year":"2019","unstructured":"Roccetti M, Delnevo G, Casini L, Cappiello G (2019) Is bigger always better? A controversial journey to the center of machine learning design, with uses and misuses of big data for predicting water meter failures. J Big Data 6(1):1\u201323","journal-title":"J Big Data"},{"key":"21303_CR5","doi-asserted-by":"crossref","unstructured":"Fujita Y, Watanabe S, Omachi M, Chan X (2020) Insertion-based modeling for end-to-end automatic speech recognition\u2019, arXiv preprint arXiv:2005.13211","DOI":"10.21437\/Interspeech.2020-1619"},{"key":"21303_CR6","doi-asserted-by":"crossref","unstructured":"Gao Z, Zhang S, McLoughlin I, Yan Z (2022) Paraformer: fast and accurate parallel transformer for non-autoregressive end-to-end speech recognition. arXiv preprint arXiv:2206.08317","DOI":"10.21437\/Interspeech.2022-9996"},{"key":"21303_CR7","unstructured":"Shahgir H, Sayeed KS, Zaman TA (2022) Applying wav2vec2 for speech recognition on bengali common voices dataset. arXiv preprint arXiv:2209.06581"},{"key":"21303_CR8","doi-asserted-by":"crossref","unstructured":"Lu L, Zhang X, Cho K, Renals S (2015) A study of the recurrent neural network encoder-decoder for large vocabulary speech recognition. In: INTERSPEECH 2015 16th Annual Conference of the International Speech Communication Association, pp 3249\u20133253","DOI":"10.21437\/Interspeech.2015-654"},{"key":"21303_CR9","doi-asserted-by":"crossref","unstructured":"Dong L, Xu S, Xu B (2018) Speech-transformer : a no-recurrence sequence-to-sequence model for speech recognition. In 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 5884\u20135888. IEEE","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"21303_CR10","doi-asserted-by":"publisher","first-page":"198","DOI":"10.1109\/TASLP.2020.3039600","volume":"29","author":"C Fan","year":"2020","unstructured":"Fan C, Yi J, Tao J, Tian Z, Liu B, Wen Z (2020) Gated recurrent fusion with joint training framework for robust end-to-end speech recognition. IEEE\/ACM Transactions on Audio, Speech, and Language Processing 29:198\u2013209","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"21303_CR11","doi-asserted-by":"publisher","unstructured":"Bu H, Du J, Na X, Wu B, Zheng H (2024) AISHELL-1 [Dataset]. TIB. https:\/\/doi.org\/10.57702\/625tb6gv","DOI":"10.57702\/625tb6gv"},{"key":"21303_CR12","doi-asserted-by":"crossref","unstructured":"Nakatani T (2019), September Improving transformer-based end-to-end speech recognition with connectionist temporal classification and language model integration. In proc. INTERSPEECH (Vol 2019, pp 1408\u20131412)","DOI":"10.21437\/Interspeech.2019-1938"},{"issue":"3","key":"21303_CR13","doi-asserted-by":"publisher","first-page":"1","DOI":"10.56345\/ijrdv9n301","volume":"9","author":"A Rista","year":"2022","unstructured":"Rista A, Kadriu A (2022) A model for Albanian speech recognition using end-to-end deep learning techniques. Interdisciplinary J Res Dev 9(3):1\u20131","journal-title":"Interdisciplinary J Res Dev"},{"key":"21303_CR14","doi-asserted-by":"publisher","unstructured":"Soky S K., Gong Z, Li (2022) Nict-Tib1: a public speech corpus of lhasa dialect for benchmarking tibetan language speech recognition systems. 25th Conference of the Oriental COCOSDA International Committee for the Co-ordination and Standardisation of Speech Databases and Assessment Techniques (O-COCOSDA), Hanoi, pp 1\u20135. https:\/\/doi.org\/10.1109\/O-COCOSDA202257103.2022.9997917","DOI":"10.1109\/O-COCOSDA202257103.2022.9997917"},{"key":"21303_CR15","doi-asserted-by":"crossref","unstructured":"Gulati A, Qin J, Chiu CC, Parmar N, Zhang Y, Yu J, Pang R (2020) Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100.","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"21303_CR16","doi-asserted-by":"publisher","DOI":"10.1109\/access.2024.3371478","author":"AhnafMozib Samin","year":"2024","unstructured":"Samin Ahnaf Mozib, Kobir M. Humayon, Shahriyar Rafee Md Mushtaq, Ahmed M. Firoz, Hasan Mehedi, Ghosh Partha, Kibria Shafkat, Rahman M. Shahidur (2024) BanSpeech: a multi-domain Bangla speech recognition benchmark towards robust performance in challenging conditions. IEEE Access. https:\/\/doi.org\/10.1109\/access.2024.3371478","journal-title":"IEEE Access"},{"key":"21303_CR17","doi-asserted-by":"publisher","first-page":"124119","DOI":"10.1016\/j.eswa.2024.124119","volume":"252","author":"A Loubser","year":"2024","unstructured":"Loubser A, De Villiers P, De Freitas A (2024) End-to-end automated speech recognition using a character based small scale transformer architecture. Expert Syst Appl 252:124119","journal-title":"Expert Syst Appl"},{"key":"21303_CR18","doi-asserted-by":"publisher","DOI":"10.1109\/taslpro.2025.3551083","author":"R Ma","year":"2025","unstructured":"Ma R, Qian M, Gales M, Knill K (2025) Asr error correction using large language models. IEEE Trans Audio Speech Lang Process. https:\/\/doi.org\/10.1109\/taslpro.2025.3551083","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"21303_CR19","unstructured":"Samiul A, Sushmit A, Abdullah Z, Nakkhatra S, Mobassir Hossen AMD, Morshed Mehnaz S, Reasat ST, Humayun AI (2022) Bengali common voice speech dataset for automatic speech recognition. ArXiv Preprint ArXiv:2206.14053"},{"key":"21303_CR20","unstructured":"Talukder Showrav T (2022) An automatic speech recognition system for bengali language based on Wav2Vec2 and transfer learning. arXiv e-prints. arXiv: 2209.08119"},{"key":"21303_CR21","doi-asserted-by":"crossref","unstructured":"Rakib M, Hossain MI, Mohammed N, Rahman F (2023) Bangla-Wave: improving bangla automatic speech recognition utilizing N-gram language models. In: Proceedings of the 12th International Conference on Software and Computer Applications, pp 297\u2013301","DOI":"10.1145\/3587828.3587872"},{"key":"21303_CR22","unstructured":"Chorowski JK, Bahdanau D, Serdyuk D, Cho K, Bengio Y 2015 \u2018Attention-based models for speech recognition.\u00a0Adv Neural Inf Process Syst 28"},{"key":"21303_CR23","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst 30"},{"key":"21303_CR24","unstructured":"OpenSLR (n.d.). Resource 53 \u2013 Bengali Common Voice Dataset. Retrieved from https:\/\/openslr.elda.org\/53\/\/"},{"key":"21303_CR25","doi-asserted-by":"publisher","unstructured":"Kjartansson O, Sarin S, Pipatsrisawat K, Jansche M, Ha L (2018) Crowd-sourced speech corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali. Proc 6th Intl Workshop Spok Lang Technol Under-Resourced Lang (SLTU), pp 52\u201355. https:\/\/doi.org\/10.21437\/SLTU.2018-11","DOI":"10.21437\/SLTU.2018-11"},{"key":"21303_CR26","doi-asserted-by":"crossref","unstructured":"Meister A, Novikov M, Karpov N, Bakhturina E, Lavrukhin V, Ginsburg B (2023) Librispeech-pc: Benchmark for evaluation of punctuation and capitalization capabilities of end-to-end asr models. In: 2023 IEEE automatic speech recognition and understanding workshop (ASRU). IEEE, pp 1\u20137","DOI":"10.1109\/ASRU57964.2023.10389666"},{"key":"21303_CR27","doi-asserted-by":"publisher","first-page":"2454","DOI":"10.1007\/s00034-023-02570-5","volume":"43","author":"B Paul","year":"2024","unstructured":"Paul B, Phadikar SRASR (2024) A novel low-cost reconstructed attention-based end-to-end speech recognizer. Circuits Syst Signal Process 43:2454\u20132476. https:\/\/doi.org\/10.1007\/s00034-023-02570-5","journal-title":"Circuits Syst Signal Process"},{"key":"21303_CR28","doi-asserted-by":"publisher","unstructured":"Paul B, Phadikar S, Bera S (2021) Indian regional spoken language identification using deep learning approach. In: Giri, D., Buyya, R., Ponnusamy, S., De, D., Adamatzky, A., Abawajy, J.H. (eds) Proceedings of the Sixth International Conference on Mathematics and Computing. Advances in Intelligent Systems and Computing, vol 1262. Springer. https:\/\/doi.org\/10.1007\/978-981-15-8061-1_21","DOI":"10.1007\/978-981-15-8061-1_21"},{"key":"21303_CR29","doi-asserted-by":"publisher","first-page":"119293","DOI":"10.1016\/j.eswa.2022.119293","volume":"215","author":"S Reza","year":"2023","unstructured":"Reza S, Ferreira MC, Machado JJM, Tavares JMR (2023) A customized residual neural network and bi-directional gated recurrent unit-based automatic speech recognition model. Expert Syst Appl 215:119293","journal-title":"Expert Syst Appl"},{"issue":"1","key":"21303_CR30","doi-asserted-by":"publisher","first-page":"1669","DOI":"10.1007\/s11042-023-15598-1","volume":"83","author":"B Paul","year":"2024","unstructured":"Paul B, Phadikar S (2024) A hybrid feature-extracted deep CNN with reduced parameters substitutes an end-to-end CNN for the recognition of spoken Bengali digits. Multimedia Tools Appl 83(1):1669\u20131692","journal-title":"Multimedia Tools Appl"},{"key":"21303_CR31","first-page":"1","volume":"61","author":"K Wu","year":"2023","unstructured":"Wu K, Fan J, Ye P, Zhu M (2023) Hyperspectral image classification using spectral\u2013spatial token enhanced transformer with hash-based positional embedding. IEEE Trans Geosci Remote Sens 61:1\u201316","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"21303_CR32","doi-asserted-by":"publisher","DOI":"10.1561\/116.00000001","author":"Q Wang","year":"2023","unstructured":"Wang Q, Zhou X, Li H (2023) Speech-and-text transformer: exploiting unpaired text for end-to-end speech recognition. APSIPA Trans Signal Inf Process. https:\/\/doi.org\/10.1561\/116.00000001","journal-title":"APSIPA Trans Signal Inf Process"},{"issue":"1","key":"21303_CR33","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13636-024-00368-0","volume":"2024","author":"U Irshad","year":"2024","unstructured":"Irshad U, Mahum R, Ganiyu I, Butt FS, Hidri L, Ali TG, El-Sherbeeny AM (2024) UTran-dsr: a novel transformer-based model using feature enhancement for dysarthric speech recognition. EURASIP J Audio Speech Music Process 2024(1):1\u201318","journal-title":"EURASIP J Audio Speech Music Process"},{"key":"21303_CR34","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-024-05898-0","author":"Y Wan","year":"2024","unstructured":"Wan Y, Yi L, Jiang B et al (2024) AENet: attention enhancement network for industrial defect detection in complex and sensitive scenarios. J Supercomput. https:\/\/doi.org\/10.1007\/s11227-024-05898-0","journal-title":"J Supercomput"},{"key":"21303_CR35","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.neucom.2021.04.038","volume":"454","author":"J Li","year":"2021","unstructured":"Li J, Wang X, Tu Z, Lyu MR (2021) On the diversity of multi-head attention. Neurocomputing 454:14\u201324","journal-title":"Neurocomputing"},{"key":"21303_CR36","doi-asserted-by":"publisher","unstructured":"Sainburg T (2019)\u00a0timsainb\/noisereduce: v1.0 (db94fe2). Zenodo.\u00a0https:\/\/doi.org\/10.5281\/zenodo.3243139","DOI":"10.5281\/zenodo.3243139"},{"key":"21303_CR37","doi-asserted-by":"crossref","unstructured":"Yang YY, Hira M, Ni Z, Astafurov A, Chen C, Puhrsch C. and, Quenneville-B\u00e9lair V (2022) Torchaudio: building blocks for audio and speech processing. In: ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, pp 6982\u20136986","DOI":"10.1109\/ICASSP43922.2022.9747236"},{"key":"21303_CR38","doi-asserted-by":"crossref","unstructured":"Chang X, Qian Y, Yu K, Watanabe S (2019) End-to-end monaural multi-speaker ASR system without pretraining. In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, pp 6256\u20136260","DOI":"10.1109\/ICASSP.2019.8682822"},{"key":"21303_CR39","doi-asserted-by":"publisher","first-page":"216","DOI":"10.1016\/j.patrec.2024.01.002","volume":"178","author":"X Wang","year":"2024","unstructured":"Wang X, Mi J, Li B, Zhao Y, Meng J (2024) CATNet: Cross-modal fusion for audio\u2013visual speech recognition. Pattern Recognit Lett 178:216\u2013222","journal-title":"Pattern Recognit Lett"},{"issue":"21","key":"21303_CR40","doi-asserted-by":"publisher","first-page":"7522","DOI":"10.3390\/app10217522","volume":"10","author":"L Chowdhury","year":"2020","unstructured":"Chowdhury L, Zunair H, Mohammed N (2020) Robust deep speaker recognition: learning latent representation with joint angular margin loss. Appl Sci 10(21):7522","journal-title":"Appl Sci"},{"key":"21303_CR41","doi-asserted-by":"crossref","unstructured":"Hasan MM, Islam MA, Chowdary S, Rahman MS (2019) Towards lexicon-free bangla automatic speech recognition system. In 2019 International Conference on Bangla Speech and Language Processing (ICBSLP). IEEE, pp 1\u20136","DOI":"10.1109\/ICBSLP47725.2019.201544"},{"issue":"6","key":"21303_CR42","first-page":"4800","volume":"12","author":"M Mubassira","year":"2021","unstructured":"Mubassira M, Das AK (2021) Implementation of recurrent neural network with language model for automatic articulation identification system in Bangla. Int J Adv Netw Appl 12(6):4800\u20134808","journal-title":"Int J Adv Netw Appl"},{"key":"21303_CR43","doi-asserted-by":"crossref","unstructured":"Hossain SK, Rihan MR, Imtiaz A, Boni PK, Gomes DJ (2024) Enhancing Bangla local speech-to-text conversion using fine-tuning Wav2vec 2.0 with OpenSLR and self-compiled datasets through transfer learning","DOI":"10.46254\/BA07.20240161"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-026-21303-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-026-21303-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-026-21303-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T14:57:00Z","timestamp":1769698620000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-026-21303-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,29]]},"references-count":43,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2026,2]]}},"alternative-id":["21303"],"URL":"https:\/\/doi.org\/10.1007\/s11042-026-21303-9","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,29]]},"assertion":[{"value":"8 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 August 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 September 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 January 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"60"}}