{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T18:58:11Z","timestamp":1757617091393,"version":"3.44.0"},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"22","license":[{"start":{"date-parts":[[2024,9,2]],"date-time":"2024-09-02T00:00:00Z","timestamp":1725235200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,2]],"date-time":"2024-09-02T00:00:00Z","timestamp":1725235200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100004608","name":"Natural Science Foundation of Jiangsu Province","doi-asserted-by":"publisher","award":["BK20190579"],"award-info":[{"award-number":["BK20190579"]}],"id":[{"id":"10.13039\/501100004608","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61902154"],"award-info":[{"award-number":["61902154"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-20129-7","type":"journal-article","created":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T20:01:41Z","timestamp":1725220901000},"page":"25661-25676","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Investigation of attention mechanism for speech command recognition"],"prefix":"10.1007","volume":"84","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7897-9584","authenticated-orcid":false,"given":"Jie","family":"Xie","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingying","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kai","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinglan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ya","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,9,2]]},"reference":[{"key":"20129_CR1","doi-asserted-by":"crossref","unstructured":"Abeje BT, Salau AO, Ebabu HA, Ayalew AM (2022) Comparative analysis of deep learning models for aspect level amharic news sentiment analysis. In: 2022 International conference on decision aid sciences and applications (DASA), pp 1628\u20131633. IEEE","DOI":"10.1109\/DASA54658.2022.9765172"},{"key":"20129_CR2","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1016\/j.jnca.2017.08.017","volume":"97","author":"M Alaa","year":"2017","unstructured":"Alaa M, Zaidan AA, Zaidan BB, Talal M, Kiah MLM (2017) A review of smart home applications based on internet of things. J Netw Comput Appl 97:48\u201365","journal-title":"J Netw Comput Appl"},{"key":"20129_CR3","doi-asserted-by":"crossref","unstructured":"Bae J, Kim D-S (2018) End-to-end speech command recognition with capsule network. In: Interspeech, pp 776\u2013780","DOI":"10.21437\/Interspeech.2018-1888"},{"key":"20129_CR4","doi-asserted-by":"crossref","unstructured":"Bai X, Du J, Pan J, Zhou H-s, Tu Y-H, Lee C-H (2020) High-resolution attention network with acoustic segment model for acoustic scene classification. In ICASSP 2020-2020 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 656\u2013660. IEEE","DOI":"10.1109\/ICASSP40776.2020.9053519"},{"key":"20129_CR5","doi-asserted-by":"crossref","unstructured":"Berdibayeva GK, Spirkin AN, Bodin ON, Bezborodova OE (2021) Features of speech commands recognition using an artificial neural network. In: 2021 Ural symposium on biomedical engineering, radioelectronics and information technology (USBEREIT), pp 0157\u20130160. IEEE","DOI":"10.1109\/USBEREIT51232.2021.9455111"},{"issue":"1","key":"20129_CR6","doi-asserted-by":"publisher","first-page":"66","DOI":"10.1186\/s40537-022-00619-x","volume":"9","author":"WB Demilie","year":"2022","unstructured":"Demilie WB, Salau AO (2022) Detection of fake news and hate speech for ethiopian languages: a systematic review of the approaches. J Big Data 9(1):66","journal-title":"J Big Data"},{"key":"20129_CR7","doi-asserted-by":"crossref","unstructured":"Ding X, Zhang X, Ma N, Han J, Ding G, Sun J (2021) Repvgg: Making vgg-style convnets great again. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 13733\u201313742","DOI":"10.1109\/CVPR46437.2021.01352"},{"key":"20129_CR8","doi-asserted-by":"crossref","unstructured":"Eyben F, W\u00f6llmer M, Schuller B (2010) Opensmile: the munich versatile and fast open-source audio feature extractor. In: Proceedings of the 18th ACM international conference on Multimedia, pp 1459\u20131462","DOI":"10.1145\/1873951.1874246"},{"key":"20129_CR9","doi-asserted-by":"crossref","unstructured":"Guiming D, Xia W, Guangyan W, Yan Z, Dan L (2016) Speech recognition based on convolutional neural networks. In 2016 IEEE international conference on signal and image processing (ICSIP), pp 708\u2013711. IEEE","DOI":"10.1109\/SIPROCESS.2016.7888355"},{"key":"20129_CR10","doi-asserted-by":"crossref","unstructured":"Hinrichs R, Dunkel J, Ostermann J (2021) Mixing time-frequency distributions for speech command recognition using convolutional neural networks. In: 2021 6th International conference on frontiers of signal processing (ICFSP), pp 6\u201311. IEEE","DOI":"10.1109\/ICFSP53514.2021.9646416"},{"issue":"2","key":"20129_CR11","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1016\/j.ifacol.2017.12.003","volume":"50","author":"Z Horn","year":"2017","unstructured":"Horn Z, Auret L, McCoy J, Aldrich C, Herbst B (2017) Performance of convolutional neural networks for feature extraction in froth flotation sensing. IFAC-PapersOnLine 50(2):13\u201318","journal-title":"IFAC-PapersOnLine"},{"key":"20129_CR12","doi-asserted-by":"crossref","unstructured":"Huang Z, Wang X, Huang L, Huang C, Wei Y, Liu W (2019) Ccnet: Criss-cross attention for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 603\u2013612","DOI":"10.1109\/ICCV.2019.00069"},{"key":"20129_CR13","unstructured":"Huzaifah M (2017) Comparison of time-frequency representations for environmental sound classification using convolutional neural networks. arXiv:1706.07156"},{"issue":"3","key":"20129_CR14","first-page":"6","volume":"13","author":"MR Kamarudin","year":"2013","unstructured":"Kamarudin MR, Yusof M, Jaya HT (2013) Low cost smart home automation via microsoft speech recognition. Int J Eng Comput Sci 13(3):6\u201311","journal-title":"Int J Eng Comput Sci"},{"key":"20129_CR15","doi-asserted-by":"publisher","first-page":"105933","DOI":"10.1016\/j.asoc.2019.105933","volume":"86","author":"R Karthik","year":"2020","unstructured":"Karthik R, Hariharan M, Anand S, Mathikshara P, Johnson A, Menaka R (2020) Attention embedded residual cnn for disease detection in tomato leaves. Appl Soft Comput 86:105933","journal-title":"Appl Soft Comput"},{"issue":"2","key":"20129_CR16","doi-asserted-by":"publisher","first-page":"413","DOI":"10.1109\/TMI.2019.2927226","volume":"39","author":"L Li","year":"2019","unstructured":"Li L, Xu M, Liu H, Li Y, Wang X, Jiang L, Wang Z, Fan X, Wang N (2019) A large-scale database and a cnn model for attention-based glaucoma detection. IEEE Trans Med Imaging 39(2):413\u2013424","journal-title":"IEEE Trans Med Imaging"},{"key":"20129_CR17","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1002\/9781118342015.ch7","volume":"1","author":"P Lindberg","year":"2012","unstructured":"Lindberg P, Leingang J, Lysaker D, Bilal K, Khan SU, Bouvry P, Ghani N, Min-Allah N, Li J (2012) Comparison and analysis of greedy energy-efficient scheduling algorithms for computational grids. Energy-efficient Distrib Comput Syst 1:189\u2013214","journal-title":"Energy-efficient Distrib Comput Syst"},{"issue":"4","key":"20129_CR18","doi-asserted-by":"publisher","first-page":"491","DOI":"10.1109\/TKDE.2005.66","volume":"17","author":"H Liu","year":"2005","unstructured":"Liu H, Yu L (2005) Toward integrating feature selection algorithms for classification and clustering. IEEE Trans knowl Data Eng 17(4):491\u2013502","journal-title":"IEEE Trans knowl Data Eng"},{"key":"20129_CR19","doi-asserted-by":"crossref","unstructured":"Lv X, Zhang M, Li H (2008) Robot control based on voice command. In: 2008 IEEE International Conference on Automation and Logistics, pp 2490\u20132494. IEEE","DOI":"10.1109\/ICAL.2008.4636587"},{"key":"20129_CR20","doi-asserted-by":"publisher","first-page":"737","DOI":"10.1007\/s10898-017-0602-1","volume":"70","author":"Y Lyu","year":"2018","unstructured":"Lyu Y, Chen L, Zhang C, Qu D, Min-Allah N, Wang Y (2018) An interleaved depth-first search method for the linear optimization problem with disjunctive constraints. J Global Optim 70:737\u2013756","journal-title":"J Global Optim"},{"issue":"3","key":"20129_CR21","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/0098-3004(93)90090-R","volume":"19","author":"A Ma\u0107kiewicz","year":"1993","unstructured":"Ma\u0107kiewicz A, Ratajczak W (1993) Principal components analysis (pca). Comput Geosci 19(3):303\u2013342","journal-title":"Comput Geosci"},{"key":"20129_CR22","doi-asserted-by":"crossref","unstructured":"Majumdar S, Ginsburg B (2020) Matchboxnet\u20131d time-channel separable convolutional neural network architecture for speech commands recognition. arXiv:2004.08531","DOI":"10.21437\/Interspeech.2020-1058"},{"issue":"25","key":"20129_CR23","doi-asserted-by":"publisher","first-page":"38667","DOI":"10.1007\/s11042-023-15118-1","volume":"82","author":"S Mehra","year":"2023","unstructured":"Mehra S, Susan S (2023) Deep fusion framework for speech command recognition using acoustic and linguistic features. Multimed Tools Appl 82(25):38667\u201338691","journal-title":"Multimed Tools Appl"},{"key":"20129_CR24","doi-asserted-by":"publisher","first-page":"101523","DOI":"10.1016\/j.scs.2019.101523","volume":"48","author":"N Min-Allah","year":"2019","unstructured":"Min-Allah N, Qureshi MB, Alrashed S, Rana OF (2019) Cost efficient resource allocation for real-time tasks in embedded systems. Sustain Cities Soc 48:101523","journal-title":"Sustain Cities Soc"},{"key":"20129_CR25","doi-asserted-by":"publisher","first-page":"19143","DOI":"10.1109\/ACCESS.2019.2896880","volume":"7","author":"AB Nassif","year":"2019","unstructured":"Nassif AB, Shahin I, Attili I, Azzeh M, Shaalan K (2019) Speech recognition using deep neural networks: A systematic review. IEEE Access 7:19143\u201319165","journal-title":"IEEE Access"},{"issue":"2","key":"20129_CR26","first-page":"377","volume":"11","author":"SK Nayak","year":"2023","unstructured":"Nayak SK, Nayak AK, Mishra S, Mohanty P (2023) Deep learning approaches for speech command recognition in a low resource kui language. Int J Intell Syst Appl Eng 11(2):377\u2013386","journal-title":"Int J Intell Syst Appl Eng"},{"key":"20129_CR27","doi-asserted-by":"crossref","unstructured":"Nguyen QH, Cao T-D (2020) A novel method for recognizing vietnamese voice commands on smartphones with support vector machine and convolutional neural networks. Wirel Commun Mob Comput 2020","DOI":"10.1155\/2020\/2312908"},{"key":"20129_CR28","doi-asserted-by":"publisher","first-page":"102089","DOI":"10.1016\/j.jairtraman.2021.102089","volume":"95","author":"O Ohneiser","year":"2021","unstructured":"Ohneiser O, Helmke H, Shetty S, Kleinert M, Ehr H, Murauskas \u0160, Pagirys T (2021) Prediction and extraction of tower controller commands for speech recognition applications. J Air Trans Manag 95:102089","journal-title":"J Air Trans Manag"},{"key":"20129_CR29","doi-asserted-by":"crossref","unstructured":"Phan H, Ch\u00e9n OY, Pham L, Koch P, De\u00a0Vos M, McLoughlin I, Mertins A (2019a) Spatio-temporal attention pooling for audio scene classification. arXiv:1904.03543","DOI":"10.21437\/Interspeech.2019-3040"},{"key":"20129_CR30","doi-asserted-by":"crossref","unstructured":"Phan P, Giang TM, Nam L, et\u00a0al (2019b) Vietnamese speech command recognition using recurrent neural networks. IJACSA) Int J Adv Comput Sci Appl 10(7)","DOI":"10.14569\/IJACSA.2019.0100728"},{"key":"20129_CR31","doi-asserted-by":"crossref","unstructured":"Qi J, Tejedor J (2022) Classical-to-quantum transfer learning for spoken command recognition based on quantum neural networks. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 8627\u20138631. IEEE","DOI":"10.1109\/ICASSP43922.2022.9747636"},{"key":"20129_CR32","doi-asserted-by":"crossref","unstructured":"Salau AO, Jain S (2019) Feature extraction: a survey of the types, techniques, applications. In: 2019 International conference on signal processing and communication (ICSC), pp 158\u2013164. IEEE","DOI":"10.1109\/ICSC45622.2019.8938371"},{"key":"20129_CR33","doi-asserted-by":"crossref","unstructured":"Solovyev RA, Vakhrushev M, Radionov A, Romanova II, Amerikanov AA, Aliev V, Shvets AA (2020) Deep learning approaches for understanding simple speech commands. In: 2020 IEEE 40th International conference on electronics and nanotechnology (ELNANO), pp 688\u2013693. IEEE","DOI":"10.1109\/ELNANO50318.2020.9088863"},{"key":"20129_CR34","doi-asserted-by":"crossref","unstructured":"Tombeng MT, Najoan R, Karel N (2018) Smart car: Digital controlling system using android smartwatch voice recognition. In: 2018 6th International conference on cyber and IT service management (CITSM), pp 1\u20135. IEEE","DOI":"10.1109\/CITSM.2018.8674359"},{"key":"20129_CR35","doi-asserted-by":"crossref","unstructured":"Wang H, Zou Y, Chong D, Wang W (2019) Environmental sound classification with parallel temporal-spectral attention. arXiv:1912.06808","DOI":"10.21437\/Interspeech.2020-1219"},{"key":"20129_CR36","doi-asserted-by":"publisher","first-page":"175353","DOI":"10.1109\/ACCESS.2019.2957572","volume":"7","author":"J Xie","year":"2019","unstructured":"Xie J, Hu K, Zhu M, Yu J, Zhu Q (2019) Investigation of different cnn-based models for improved bird sound classification. IEEE Access 7:175353\u2013175361","journal-title":"IEEE Access"},{"issue":"5","key":"20129_CR37","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3178115","volume":"9","author":"Z Zhang","year":"2018","unstructured":"Zhang Z, Geiger J, Pohjalainen J, Mousa AE-D, Jin W, Schuller B (2018) Deep learning for environmentally robust speech recognition: An overview of recent developments. ACM Trans Intell Syst Technol (TIST) 9(5):1\u201328","journal-title":"ACM Trans Intell Syst Technol (TIST)"},{"key":"20129_CR38","doi-asserted-by":"crossref","unstructured":"Zhang Z, Qin R, Li G, Du Z, Wen G, He W (2022) A novel approach for surface integrity monitoring in high-energy nanosecond-pulse laser shock peening: Acoustic emission and hybrid-attention cnn. IEEE Trans Ind Inf 1\u20131","DOI":"10.1109\/TII.2022.3157641"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20129-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-20129-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20129-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,5]],"date-time":"2025-09-05T21:45:28Z","timestamp":1757108728000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-20129-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,2]]},"references-count":38,"journal-issue":{"issue":"22","published-online":{"date-parts":[[2025,7]]}},"alternative-id":["20129"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-20129-7","relation":{},"ISSN":["1573-7721"],"issn-type":[{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2024,9,2]]},"assertion":[{"value":"29 November 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 June 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 August 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 September 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}