{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T19:28:02Z","timestamp":1774639682974,"version":"3.50.1"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T00:00:00Z","timestamp":1703376000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T00:00:00Z","timestamp":1703376000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Circuits Syst Signal Process"],"published-print":{"date-parts":[[2024,4]]},"DOI":"10.1007\/s00034-023-02570-5","type":"journal-article","created":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T16:02:00Z","timestamp":1703433720000},"page":"2454-2476","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["RAttSR: A Novel Low-Cost Reconstructed Attention-Based End-to-End Speech Recognizer"],"prefix":"10.1007","volume":"43","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4485-3393","authenticated-orcid":false,"given":"Bachchu","family":"Paul","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Santanu","family":"Phadikar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,12,24]]},"reference":[{"key":"2570_CR1","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"T. Afouras, J.S. Chung, A. Senior, O. Vinyals, A. Zisserman, Deep audio-visual speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44, 8717\u20138727 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2570_CR2","unstructured":"T. Afouras, J.S. Chung, A. Senior, O. Vinyals, A. Zisserman, Deep audio-visual speech recognition (2018). arXiv:1809.02108"},{"key":"2570_CR3","unstructured":"D. Andrade, D. Coimbra, S. Leo, M.L.D.S. Viana, C. Bernkopf, A neural attention model for speech command recognition (2018). arXiv:1808.08929"},{"key":"2570_CR4","doi-asserted-by":"publisher","first-page":"101131","DOI":"10.1016\/j.csl.2020.101131","volume":"65","author":"NT Anh","year":"2021","unstructured":"N.T. Anh, Y. Hu, Q. He, T.T.N. Linh, H.T.K. Dung, C. Guang, Lis-net: an end-to-end light interior search network for speech command recognition. Comput. Speech Lang. 65, 101131 (2021)","journal-title":"Comput. Speech Lang."},{"key":"2570_CR5","unstructured":"T. Audacity, Audacity. The name audacity (R) is a registered trademark of dominic mazzoni retrieved from http:\/\/audacity.sourceforge.net (2017)"},{"key":"2570_CR6","unstructured":"A. Canavan, D. Graff, G. Zipperlen. CALLHOME American English Speech LDC97S42 (Linguistic Data Consortium, Philadelphia, 1997)"},{"key":"2570_CR7","doi-asserted-by":"crossref","unstructured":"X. Chang, T. Maekaku, Y. Fujita, S. Watanabe, End-to-end integration of speech recognition, speech enhancement, and self-supervised learning representation (2022). arXiv:2204.00540","DOI":"10.21437\/Interspeech.2022-10839"},{"key":"2570_CR8","unstructured":"K. Choi, D. Joo, J. Kim, Kapre: on-gpu audio preprocessing layers for a quick implementation of deep neural network models with keras (2017). arXiv:1706.05781"},{"key":"2570_CR9","doi-asserted-by":"crossref","unstructured":"S. Choi, S. Seo, B. Shin, H. Byun, M. Kersner, B. Kim, D. Kim, S. Ha. Temporal convolution for real-time keyword spotting on mobile devices (2019). arXiv:1904.03814","DOI":"10.21437\/Interspeech.2019-1363"},{"key":"2570_CR10","unstructured":"J. Chorowski, D. Bahdanau, K. Cho, Y. Bengio, End-to-end continuous speech recognition using attention-based recurrent NN: First results (2014). arXiv:1412.1602"},{"key":"2570_CR11","unstructured":"J.K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, Y. Bengio, Attention-based models for speech recognition. Adv. Neural Inf. Process. Syst. 28 (2015)"},{"key":"2570_CR12","unstructured":"A. Dobashi, C.S. Leow, H. Nishizaki, Frequency-directional attention model for multilingual automatic speech recognition (2022). arXiv:2203.15473"},{"key":"2570_CR13","doi-asserted-by":"publisher","first-page":"198","DOI":"10.1109\/TASLP.2020.3039600","volume":"29","author":"C Fan","year":"2020","unstructured":"C. Fan, J. Yi, J. Tao, Z. Tian, B. Liu, Z. Wen, Gated recurrent fusion with joint training framework for robust end-to-end speech recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 198\u2013209 (2020)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"2570_CR14","doi-asserted-by":"crossref","unstructured":"Y. Fujita, S. Watanabe, M. Omachi, X. Chan, Insertion-based modeling for end-to-end automatic speech recognition (2020). arXiv:2005.13211","DOI":"10.21437\/Interspeech.2020-1619"},{"key":"2570_CR15","volume-title":"TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1","author":"JS Garofolo","year":"1993","unstructured":"J.S. Garofolo et al., TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1 (Linguistic Data Consortium, Philadelphia, 1993)"},{"issue":"1","key":"2570_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13636-021-00217-4","volume":"2021","author":"AL Georgescu","year":"2021","unstructured":"A.L. Georgescu, A. Pappalardo, H. Cucu, M. Blott, Performance vs. hardware requirements in state-of-the-art automatic speech recognition. EURASIP J. Audio Speech Music Process. 2021(1), 1\u201330 (2021)","journal-title":"EURASIP J. Audio Speech Music Process."},{"key":"2570_CR17","doi-asserted-by":"crossref","unstructured":"A. Graves, S. Fern\u00e1ndez, F. Gomez, J. Schmidhuber, Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In Proceedings of the 23rd International Conference on Machine Learning (2006, June), pp. 369\u2013376","DOI":"10.1145\/1143844.1143891"},{"key":"2570_CR18","doi-asserted-by":"crossref","unstructured":"A. Graves, Sequence transduction with recurrent neural networks (2012). arXiv:1211.3711","DOI":"10.1007\/978-3-642-24797-2"},{"key":"2570_CR19","doi-asserted-by":"crossref","unstructured":"A. Gulati, J. Qin, C.C. Chiu, N. Parmar, Y. Zhang, J. Yu, W. Han, S. Wang, Z. Zhang, Y. Wu, R. Pang. Conformer: Convolution-augmented transformer for speech recognition (2020). arXiv:2005.08100","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"2570_CR20","doi-asserted-by":"crossref","unstructured":"K.J. Han, R. Prieto, T. Ma, State-of-the-art speech recognition using multi-stream self-attention with dilated 1d convolutions. In 2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU) (IEEE, 2019, December), pp. 54\u201361","DOI":"10.1109\/ASRU46091.2019.9003730"},{"key":"2570_CR21","doi-asserted-by":"crossref","unstructured":"T. Hori, S. Watanabe, Y. Zhang, W. Chan, Advances in joint CTC-attention based end-to-end speech recognition with a deep CNN encoder and RNN-LM (2017). arXiv:1706.02737","DOI":"10.21437\/Interspeech.2017-1296"},{"issue":"4k","key":"2570_CR22","first-page":"10k","volume":"37","author":"W Hou","year":"2020","unstructured":"W. Hou, Y. Dong, B. Zhuang, L. Yang, J. Shi, T. Shinozaki, Large-scale end-to-end multilingual speech recognition and language identification with multi-task learning. Babel 37(4k), 10k (2020)","journal-title":"Babel"},{"key":"2570_CR23","doi-asserted-by":"publisher","first-page":"101272","DOI":"10.1016\/j.csl.2021.101272","volume":"71","author":"A Hussein","year":"2022","unstructured":"A. Hussein, S. Watanabe, A. Ali, Arabic speech recognition by end-to-end, modular systems and human. Comput. Speech Lang. 71, 101272 (2022)","journal-title":"Comput. Speech Lang."},{"key":"2570_CR24","doi-asserted-by":"crossref","unstructured":"S. Kim, T. Hori, S. Watanabe, Joint CTC-attention based end-to-end speech recognition using multi-task learning. In 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP) (IEEE, 2017, March), pp. 4835\u20134839","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"2570_CR25","doi-asserted-by":"crossref","unstructured":"J. Li, Recent advances in end-to-end automatic speech recognition. APSIPA Trans. Signal Inf. Process. 11(1), (2022)","DOI":"10.1561\/116.00000050"},{"key":"2570_CR26","unstructured":"S. Liang, W. Yan, Multilingual speech recognition based on the end-to-end framework. Multimed. Tools Appl. (2022)"},{"key":"2570_CR27","doi-asserted-by":"crossref","unstructured":"L. Lu, X. Zhang, K. Cho, S. Renals, A study of the recurrent neural network encoder\u2013decoder for large vocabulary speech recognition. In Sixteenth Annual Conference of the International Speech Communication Association (2015)","DOI":"10.21437\/Interspeech.2015-654"},{"key":"2570_CR28","doi-asserted-by":"publisher","unstructured":"B. Paul, S. Phadikar, S. Bera, Indian regional spoken language identification using deep learning approach. In: Giri, D., Buyya, R., Ponnusamy, S., De, D., Adamatzky, A., Abawajy, J.H. (eds) Proceedings of the Sixth International Conference on Mathematics and Computing. Advances in Intelligent Systems and Computing, vol 1262. Springer, Singapore (2021) https:\/\/doi.org\/10.1007\/978-981-15-8061-1_21","DOI":"10.1007\/978-981-15-8061-1_21"},{"key":"2570_CR29","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-023-15598-1","author":"B Paul","year":"2023","unstructured":"B. Paul, S. Phadikar, A hybrid feature-extracted deep CNN with reduced parameters substitutes an End-to-End CNN for the recognition of spoken Bengali digits. Multimed. Tools Appl. (2023). https:\/\/doi.org\/10.1007\/s11042-023-15598-1","journal-title":"Multimed. Tools Appl."},{"key":"2570_CR30","doi-asserted-by":"crossref","unstructured":"D. Peter, W. Roth, F. Pernkopf, End-to-end keyword spotting using neural architecture search and quantization. In ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2022, May), pp. 3423\u20133427","DOI":"10.1109\/ICASSP43922.2022.9746535"},{"key":"2570_CR31","doi-asserted-by":"crossref","unstructured":"S. Petridis, T. Stafylakis, P. Ma, G. Tzimiropoulos, M. Pantic, Audio-visual speech recognition with a hybrid ctc\/attention architecture. In 2018 IEEE Spoken Language Technology Workshop (SLT) (IEEE, 2018, December), pp. 513\u2013520","DOI":"10.1109\/SLT.2018.8639643"},{"key":"2570_CR32","doi-asserted-by":"crossref","unstructured":"V. Pratap, Q. Xu, J. Kahn, G. Avidov, T. Likhomanenko, A. Hannun, V. Liptchinsky, G. Synnaeve, R. Collobert. Scaling up online speech recognition using convnets (2020). arXiv:2001.09727","DOI":"10.21437\/Interspeech.2020-2840"},{"issue":"3","key":"2570_CR33","doi-asserted-by":"publisher","first-page":"1","DOI":"10.56345\/ijrdv9n301","volume":"9","author":"A Rista","year":"2022","unstructured":"A. Rista, A. Kadriu, A model for albanian speech recognition using end-to-end deep learning techniques. Interdiscip. J. Res. Dev. 9(3), 1\u20131 (2022)","journal-title":"Interdiscip. J. Res. Dev."},{"key":"2570_CR34","unstructured":"D.W. Romero, A. Kuzina, E.J. Bekkers, J.M. Tomczak, M. Hoogendoorn, Ckconv: Continuous kernel convolution for sequential data (2021). arXiv:2102.02611"},{"key":"2570_CR35","doi-asserted-by":"crossref","unstructured":"R. Vygon, N. Mikhaylovskiy, Learning efficient representations for keyword spotting with triplet loss. In International Conference on Speech and Computer (Springer, Cham, 2021, September), pp. 773\u2013785","DOI":"10.1007\/978-3-030-87802-3_69"},{"issue":"8","key":"2570_CR36","doi-asserted-by":"publisher","first-page":"1018","DOI":"10.3390\/sym11081018","volume":"11","author":"D Wang","year":"2019","unstructured":"D. Wang, X. Wang, S. Lv, An overview of end-to-end automatic speech recognition. Symmetry 11(8), 1018 (2019)","journal-title":"Symmetry"},{"key":"2570_CR37","unstructured":"P. Warden, Speech commands: a dataset for limited-vocabulary speech recognition (2018). arXiv:1804.03209"},{"issue":"8","key":"2570_CR38","doi-asserted-by":"publisher","first-page":"1240","DOI":"10.1109\/JSTSP.2017.2763455","volume":"11","author":"S Watanabe","year":"2017","unstructured":"S. Watanabe, T. Hori, S. Kim, J.R. Hershey, T. Hayashi, Hybrid CTC\/attention architecture for end-to-end speech recognition. IEEE J. Sel. Top. Signal Process. 11(8), 1240\u20131253 (2017)","journal-title":"IEEE J. Sel. Top. Signal Process."},{"issue":"3","key":"2570_CR39","doi-asserted-by":"publisher","first-page":"1525","DOI":"10.1007\/s12652-021-03022-1","volume":"13","author":"Y Wei","year":"2022","unstructured":"Y. Wei, Z. Gong, S. Yang, K. Ye, Y. Wen, EdgeCRNN: an edge-computing oriented model of acoustic feature enhancement for keyword spotting. J. Ambient. Intell. Humaniz. Comput. 13(3), 1525\u20131535 (2022)","journal-title":"J. Ambient. Intell. Humaniz. Comput."},{"key":"2570_CR40","doi-asserted-by":"publisher","first-page":"788","DOI":"10.1109\/LSP.2021.3071668","volume":"28","author":"C Yi","year":"2021","unstructured":"C. Yi, S. Zhou, B. Xu, Efficiently fusing pretrained acoustic and linguistic encoders for low-resource speech recognition. IEEE Signal Process. Lett. 28, 788\u2013792 (2021)","journal-title":"IEEE Signal Process. Lett."},{"issue":"2","key":"2570_CR41","doi-asserted-by":"publisher","first-page":"214","DOI":"10.3390\/sym13020214","volume":"13","author":"N Zacarias-Morales","year":"2021","unstructured":"N. Zacarias-Morales, P. Pancardo, J.A. Hern\u00e1ndez-Nolasco, M. Garcia-Constantino, Attention-inspired artificial neural networks for speech processing: a systematic review. Symmetry 13(2), 214 (2021)","journal-title":"Symmetry"},{"key":"2570_CR42","doi-asserted-by":"crossref","unstructured":"A. Zeyer, K. Irie, R. Schl\u00fcter, H. Ney, Improved training of end-to-end attention models for speech recognition (2018). arXiv:1805.03294","DOI":"10.21437\/Interspeech.2018-1616"},{"key":"2570_CR43","doi-asserted-by":"crossref","unstructured":"S. Zhang, E. Loweimi, P. Bell, S. Renals, On the usefulness of self-attention for automatic speech recognition with transformers. In 2021 IEEE Spoken Language Technology Workshop (SLT) (IEEE, 2021, January), pp. 89\u201396","DOI":"10.1109\/SLT48900.2021.9383521"},{"key":"2570_CR44","doi-asserted-by":"crossref","unstructured":"Y. Zhang, D.S. Park, W. Han, J. Qin, A. Gulati, J. Shor, Y. Wu et al., Bigssl: Exploring the frontier of large-scale semi-supervised learning for automatic speech recognition. IEEE J. Sel. Top. Signal Process. (2022)","DOI":"10.1109\/JSTSP.2022.3182537"}],"container-title":["Circuits, Systems, and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-023-02570-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00034-023-02570-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-023-02570-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,2]],"date-time":"2024-03-02T15:14:24Z","timestamp":1709392464000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00034-023-02570-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,24]]},"references-count":44,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,4]]}},"alternative-id":["2570"],"URL":"https:\/\/doi.org\/10.1007\/s00034-023-02570-5","relation":{},"ISSN":["0278-081X","1531-5878"],"issn-type":[{"value":"0278-081X","type":"print"},{"value":"1531-5878","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12,24]]},"assertion":[{"value":"22 June 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 November 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 November 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 December 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}