{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,30]],"date-time":"2025-11-30T04:09:45Z","timestamp":1764475785201,"version":"3.46.0"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T00:00:00Z","timestamp":1753660800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T00:00:00Z","timestamp":1753660800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Circuits Syst Signal Process"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s00034-025-03231-5","type":"journal-article","created":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T17:57:59Z","timestamp":1753725479000},"page":"9336-9361","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating Suprasegmental Features for Phonological Fusion and Spectrogram-Based Speech Command 
Recognition"],"prefix":"10.1007","volume":"44","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6397-6049","authenticated-orcid":false,"given":"Sunakshi","family":"Mehra","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2046-8642","authenticated-orcid":false,"given":"Virender","family":"Ranga","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0420-1892","authenticated-orcid":false,"given":"Ritu","family":"Agarwal","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,28]]},"reference":[{"issue":"1","key":"3231_CR1","doi-asserted-by":"publisher","first-page":"27","DOI":"10.21608\/ejle.2020.47685.1015","volume":"8","author":"ER Abdelmaksoud","year":"2021","unstructured":"E.R. Abdelmaksoud, A. Hassen, N. Hassan, M. Hesham, Convolutional neural network for Arabic speech recognition. Egypt J Language Eng 8(1), 27\u201338 (2021)","journal-title":"Egypt J Language Eng"},{"key":"3231_CR2","volume-title":"Arabic phonetics","author":"M Alghamdi","year":"2001","unstructured":"M. Alghamdi, Arabic phonetics (Al-Toubah Bookshop, Riyadh, 2001)"},{"issue":"2","key":"3231_CR3","doi-asserted-by":"publisher","first-page":"343","DOI":"10.1016\/j.ipm.2017.07.002","volume":"56","author":"E Alsharhan","year":"2019","unstructured":"E. Alsharhan, A. Ramsay, Improved Arabic speech recognition system through the automatic generation of fine-grained phonetic transcriptions. Inf. Process. Manag. 56(2), 343\u2013353 (2019). https:\/\/doi.org\/10.1016\/j.ipm.2017.07.002","journal-title":"Inf. Process. Manag."},{"key":"3231_CR4","doi-asserted-by":"publisher","first-page":"975","DOI":"10.1007\/s10579-020-09505-5","volume":"54","author":"E Alsharhan","year":"2020","unstructured":"E. Alsharhan, A. Ramsay, Investigating the effects of gender, dialect, and training size on the performance of Arabic speech recognition. Lang. Resour. Eval. 54, 975\u2013998 (2020). 
https:\/\/doi.org\/10.1007\/s10579-020-09505-5","journal-title":"Lang. Resour. Eval."},{"key":"3231_CR5","doi-asserted-by":"publisher","first-page":"42","DOI":"10.1016\/j.specom.2016.11.004","volume":"86","author":"M Alsulaiman","year":"2017","unstructured":"M. Alsulaiman, A. Mahmood, G. Muhammad, Speaker recognition based on Arabic phonemes. Speech Commun. 86, 42\u201351 (2017). https:\/\/doi.org\/10.1016\/j.specom.2016.11.004","journal-title":"Speech Commun."},{"issue":"1","key":"3231_CR6","first-page":"138","volume":"3","author":"G Begu\u0161","year":"2020","unstructured":"G. Begu\u0161, Modeling unsupervised phonetic and phonological learning in generative adversarial phonology. Proc Soc Comput Linguist 3(1), 138\u2013148 (2020)","journal-title":"Proc Soc Comput Linguist"},{"key":"3231_CR7","doi-asserted-by":"publisher","unstructured":"A. Berg, M. O\u2018Connor, & M.T. Cruz, Keyword transformer: a self-attention model for keyword spotting. arXiv preprint arXiv:2104.00769 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-1286","DOI":"10.21437\/Interspeech.2021-1286"},{"key":"3231_CR8","unstructured":"Bisol, Leda, ed.\u00a0Introdu\u00e7\u00e3o a estudos de fonologia do portugu\u00eas brasileiro. EdiPUCRS, 2005"},{"key":"3231_CR9","doi-asserted-by":"publisher","unstructured":"M.V. Borges, Fon\u00e9tica e fonologia do portugu\u00eas: roteiro de estudos e guia de exerc\u00edcios. 183\u2013188 (2000). https:\/\/doi.org\/10.1590\/S0102-44502000000100011","DOI":"10.1590\/S0102-44502000000100011"},{"key":"3231_CR10","doi-asserted-by":"crossref","unstructured":"L. Cances, and T. Pellegrini. Comparison of deep co-training and mean-teacher approaches for semi-supervised audio tagging. In ICASSP 2021\u20132021 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 361\u2013365. IEEE, 2021. https:\/\/hal.science\/hal-03170277","DOI":"10.1109\/ICASSP39728.2021.9415116"},{"key":"3231_CR11","doi-asserted-by":"publisher","unstructured":"D. 
Cer, Y. Yang, S.Y. Kong, N. Hua, N. Limtiaco, R.S. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y.H. Sung et al, Universal sentence encoder. arXiv preprint arXiv:1803.11175 (2018). https:\/\/doi.org\/10.48550\/arXiv.1803.11175","DOI":"10.48550\/arXiv.1803.11175"},{"key":"3231_CR12","doi-asserted-by":"publisher","unstructured":"K. Chen, X. Du, B. Zhu, Z. Ma, Taylor Berg-Kirkpatrick, and Shlomo Dubnov. HTS-AT: A hierarchical token-semantic audio transformer for sound classification and detection. In\u00a0ICASSP 2022\u20132022 ieee international conference on acoustics, speech and signal processing (ICASSP), pp. 646\u2013650. IEEE (2022). https:\/\/doi.org\/10.48550\/arXiv.2202.00874","DOI":"10.48550\/arXiv.2202.00874"},{"issue":"4","key":"3231_CR13","doi-asserted-by":"publisher","first-page":"135","DOI":"10.3109\/03091908209041006","volume":"6","author":"RI Damper","year":"1982","unstructured":"R.I. Damper, Speech technology\u2014implications for biomedical engineering. J. Med. Eng. Technol. 6(4), 135\u2013149 (1982)","journal-title":"J. Med. Eng. Technol."},{"key":"3231_CR14","doi-asserted-by":"publisher","unstructured":"D.C. De Andrade, S. Leo, M.L. Viana, C. Bernkopf, A neural attention model for speech command recognition. arXiv preprint arXiv:1808.08929\u00a0(2018). https:\/\/doi.org\/10.48550\/arXiv.1808.08929.","DOI":"10.48550\/arXiv.1808.08929"},{"key":"3231_CR15","unstructured":"J.R. Deller Jr, J.G. Proakis, and J.H.L. Hansen, Discrete-time processing of speech signals. MacMillan. New York (1993)."},{"key":"3231_CR16","unstructured":"J. Duchi, E. Hazan, and Y. Singer, Adaptive subgradient methods for online learning and stochastic optimization. J. Mach. Learn. Res. 12(7) (2011)."},{"key":"3231_CR17","doi-asserted-by":"publisher","unstructured":"A. Gazneli, G. Zimerman, T. Ridnik, G. Sharir, and A. Noy, End-to-end audio strikes back: Boosting augmentations towards an efficient audio classification network. arXiv preprint (2022).  
https:\/\/doi.org\/10.48550\/arXiv.2204.11479","DOI":"10.48550\/arXiv.2204.11479"},{"key":"3231_CR18","doi-asserted-by":"publisher","unstructured":"K. Glocker, A. Herygers, and M. Georges, Allophant: cross-lingual phoneme recognition with articulatory attributes. \u00a0(2023). arXiv preprint arXiv:2306.04306. https:\/\/doi.org\/10.21437\/Interspeech.2023-772.","DOI":"10.21437\/Interspeech.2023-772"},{"key":"3231_CR19","doi-asserted-by":"crossref","unstructured":"Y. Gong, Y.-A. Chung, and J. Glass, Ast: audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021).","DOI":"10.21437\/Interspeech.2021-698"},{"key":"3231_CR20","unstructured":"R.C. Gonzalez, and R.E. Woods, Digital image processing, prentice hall. Upper Saddle River, NJ (2008)."},{"key":"3231_CR21","doi-asserted-by":"publisher","unstructured":"MA. Haque, A. Verma, J.S.R. Alex, and N. Venkatesan, Experimental evaluation of CNN architecture for speech recognition. In\u00a0First International Conference on Sustainable Technologies for Computational Intelligence: Proceedings of ICTSCI 2019, pp. 507\u2013514. Springer Singapore, 2020. https:\/\/doi.org\/10.1007\/978-981-15-0029-9_40.","DOI":"10.1007\/978-981-15-0029-9_40"},{"key":"3231_CR22","doi-asserted-by":"publisher","unstructured":"T.J. Hazen, Automatic alignment and error correction of human generated transcripts for long speech recordings. In 9th international conference on spoken language processing. (2006). https:\/\/doi.org\/10.21437\/Interspeech.2006-449.","DOI":"10.21437\/Interspeech.2006-449"},{"key":"3231_CR23","doi-asserted-by":"publisher","first-page":"199","DOI":"10.1007\/s12530-011-9034-1","volume":"2","author":"T Herbig","year":"2011","unstructured":"T. Herbig, F. Gerl, W. Minker, R. Haeb-Umbach, Adaptive systems for unsupervised speaker tracking and speech recognition. Evol. Syst. 2, 199\u2013214 (2011). https:\/\/doi.org\/10.1007\/s12530-011-9034-1","journal-title":"Evol. 
Syst."},{"key":"3231_CR24","doi-asserted-by":"publisher","unstructured":"B. Higy, and B. Peter, Few-shot learning with attention-based sequence-to-sequence models. arXiv preprint arXiv:1811.03519\u00a0(2018). https:\/\/doi.org\/10.48550\/arXiv.1811.03519.","DOI":"10.48550\/arXiv.1811.03519"},{"key":"3231_CR25","doi-asserted-by":"publisher","unstructured":"B. Kim, S. Chang, J. Lee, and D. Sung, Broadcasted residual learning for efficient keyword spotting. arXiv preprint (2021). https:\/\/doi.org\/10.48550\/arXiv.2106.04140","DOI":"10.48550\/arXiv.2106.04140"},{"key":"3231_CR26","unstructured":"K. Kirchhoff, J. Bilmes, J. Henderson, R. Schwartz, M. Noamany, P. Schone, G. Ji et al, Novel speech recognition models for Arabic. In Johns-Hopkins University summer research workshop. 2002. https:\/\/people.ece.uw.edu\/bilmes\/p\/pgs\/index.html."},{"issue":"3","key":"3231_CR27","doi-asserted-by":"publisher","first-page":"501","DOI":"10.1007\/s12530-022-09473-x","volume":"14","author":"YV Koteswararao","year":"2023","unstructured":"Y.V. Koteswararao, C.R. Rao, Multichannel KHMF for speech separation with enthalpy based DOA and score based CNN (SCNN). Evolv. Syst. 14(3), 501\u2013518 (2023). https:\/\/doi.org\/10.1007\/s12530-022-09473-x","journal-title":"Evolv. Syst."},{"key":"3231_CR28","unstructured":"P. Ladefoged, and K. Johnson, A course in phonetics. (Cengage learning, 2014)."},{"key":"3231_CR29","doi-asserted-by":"publisher","unstructured":"I. Lezhenin, N. Bogach, E. Pyshkin, Urban sound classification using long short-term memory neural network. In 2019 federated conference on computer science and information systems (FedCSIS), 57\u201360 (2019) https:\/\/doi.org\/10.15439\/2019F185","DOI":"10.15439\/2019F185"},{"key":"3231_CR30","doi-asserted-by":"publisher","unstructured":"B. Li, J.Y. Xie, and F. Rudzicz, Representation learning for discovering phonemic tone contours. arXiv preprint arXiv:1910.08987\u00a0(2019). 
https:\/\/doi.org\/10.48550\/arXiv.1910.08987","DOI":"10.48550\/arXiv.1910.08987"},{"key":"3231_CR31","doi-asserted-by":"publisher","unstructured":"X. Li, S. Dalmia, J. Li, M. Lee, P. Littell, J. Yao, A. Anastasopoulos et al., Universal phone recognition with a multilingual allophone system. In ICASSP 2020\u20132020 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 8249\u20138253. IEEE (2020). https:\/\/doi.org\/10.48550\/arXiv.2002.11800","DOI":"10.48550\/arXiv.2002.11800"},{"key":"3231_CR32","doi-asserted-by":"publisher","unstructured":"J. Lin, K. Kilgour, D. Roblek, and M. Sharifi, Training keyword spotters with limited and synthesized speech data. In ICASSP 2020\u20132020 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 7474\u20137478. IEEE, 2020. https:\/\/doi.org\/10.48550\/arXiv.2002.01322","DOI":"10.48550\/arXiv.2002.01322"},{"key":"3231_CR33","doi-asserted-by":"publisher","unstructured":"Z. Liu, Y. Lin, Y Cao, H. Hu, Y. Wei, Z. Zhang, S. Lin, B. Guo, Swin transformer: Hierarchical vision transformer using shifted windows. In proceedings of the IEEE\/CVF international conference on computer vision, pp. 10012\u201310022 (2021). https:\/\/doi.org\/10.48550\/arXiv.2103.14030","DOI":"10.48550\/arXiv.2103.14030"},{"key":"3231_CR34","doi-asserted-by":"publisher","unstructured":"S. Majumdar, and B. Ginsburg. Matchboxnet: 1d time-channel separable convolutional neural network architecture for speech commands recognition. arXiv preprint (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-1058","DOI":"10.21437\/Interspeech.2020-1058"},{"key":"3231_CR35","doi-asserted-by":"publisher","first-page":"85","DOI":"10.1007\/s12530-016-9156-6","volume":"8","author":"M Malcangi","year":"2017","unstructured":"M. Malcangi, P. Grew, Evolving connectionist method for adaptive audiovisual speech recognition. Evol. Syst. 8, 85\u201394 (2017). 
https:\/\/doi.org\/10.1007\/s12530-016-9156-6","journal-title":"Evol. Syst."},{"key":"3231_CR36","doi-asserted-by":"publisher","unstructured":"S. Mehra, S. Susan, Early fusion of phone embeddings for recognition of low-resourced accented speech, in Proceedings of the 2022 4th International Conference on Artificial Intelligence and Speech Technology (AIST), IEEE, pp. 1\u20135, Dec. 2022. https:\/\/doi.org\/10.1109\/AIST55798.2022.10064735","DOI":"10.1109\/AIST55798.2022.10064735"},{"issue":"2","key":"3231_CR37","doi-asserted-by":"publisher","first-page":"1365","DOI":"10.1007\/s11760-023-02845-z","volume":"18","author":"S Mehra","year":"2024","unstructured":"S. Mehra, V. Ranga, R. Agarwal, Improving speech command recognition through decision-level fusion of deep filtered speech cues. SIViP 18(2), 1365\u20131373 (2024). https:\/\/doi.org\/10.1007\/s11760-023-02845-z","journal-title":"SIViP"},{"issue":"6","key":"3231_CR38","doi-asserted-by":"publisher","DOI":"10.1111\/coin.70012","volume":"40","author":"S Mehra","year":"2024","unstructured":"S. Mehra, V. Ranga, R. Agarwal, Multimodal integration of Mel spectrograms and text transcripts for enhanced automatic speech recognition: leveraging extractive transformer-based approaches and late fusion strategies. Comput. Intell. 40(6), e70012 (2024). https:\/\/doi.org\/10.1111\/coin.70012","journal-title":"Comput. Intell."},{"issue":"3","key":"3231_CR39","doi-asserted-by":"publisher","first-page":"2020","DOI":"10.1007\/s00034-024-02915-8","volume":"44","author":"S Mehra","year":"2025","unstructured":"S. Mehra, V. Ranga, R. Agarwal, Dhivehi speech recognition: a multimodal approach for dhivehi language in resource-constrained settings. Circuits Syst. Signal Process. 44(3), 2020\u20132040 (2025)","journal-title":"Circuits Syst. Signal Process."},{"key":"3231_CR40","doi-asserted-by":"publisher","unstructured":"M.M. Morshed, and A.O. Ahsan, Attention-free keyword spotting. arXiv preprint arXiv:2110.07749 (2021). 
https:\/\/doi.org\/10.48550\/arXiv.2110.07749","DOI":"10.48550\/arXiv.2110.07749"},{"key":"3231_CR41","first-page":"65","volume":"100","author":"D Newman","year":"2002","unstructured":"D. Newman, The phonetic status of Arabic within the world\u2019s languages: the uniqueness of the lughat al-daad. Antwerp Papers Linguist. 100, 65\u201375 (2002)","journal-title":"Antwerp Papers Linguist."},{"key":"3231_CR42","doi-asserted-by":"publisher","unstructured":"D. Niizumi, D. Takeuchi, Y. Ohishi, N. Harada, K. Kashino, Masked Modeling duo: learning representations by encouraging both networks to model the input. In ICASSP 2023\u20132023 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 1\u20135. IEEE (2023). https:\/\/doi.org\/10.48550\/arXiv.2210.14648","DOI":"10.48550\/arXiv.2210.14648"},{"key":"3231_CR43","doi-asserted-by":"publisher","first-page":"495","DOI":"10.1007\/s10772-016-9337-5","volume":"19","author":"KM Nahar","year":"2016","unstructured":"K.M. Nahar, M. Abu Shquier, W.G. Al-Khatib, H. Al-Muhtaseb, M. Elshafei, Arabic phonemes recognition using hybrid LVQ\/HMM model for continuous speech recognition. Int. J. Speech Technol. 19, 495\u2013508 (2016). https:\/\/doi.org\/10.1007\/s10772-016-9337-5","journal-title":"Int. J. Speech Technol."},{"key":"3231_CR44","doi-asserted-by":"publisher","unstructured":"D. Ng, Y. Chen, B. Tian, Q. Fu, and E.S. Chng. Convmixer: feature interactive convolution with curriculum learning for small footprint and noisy far-field keyword spotting. In ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 3603\u20133607. IEEE (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747025","DOI":"10.1109\/ICASSP43922.2022.9747025"},{"key":"3231_CR45","doi-asserted-by":"publisher","first-page":"148","DOI":"10.1121\/1.1917300","volume":"23","author":"GE Peterson","year":"1951","unstructured":"G.E. Peterson, H.L. 
Barney, Control methods used in a study of the vowels. J. Acoust. Soc. Am. 23, 148\u2013148 (1951)","journal-title":"J. Acoust. Soc. Am."},{"issue":"13","key":"3231_CR46","doi-asserted-by":"publisher","first-page":"5668","DOI":"10.1016\/j.eswa.2015.02.036","volume":"42","author":"E Principi","year":"2015","unstructured":"E. Principi, S. Squartini, R. Bonfigli, G. Ferroni, F. Piazza, An integrated system for voice command recognition and emergency detection based on audio signals. Expert Syst. Appl. 42(13), 5668\u20135683 (2015). https:\/\/doi.org\/10.1016\/j.eswa.2015.02.036","journal-title":"Expert Syst. Appl."},{"key":"3231_CR47","doi-asserted-by":"publisher","unstructured":"L. Rabiner, and B.H. Juang, Fundamentals of speech processing. 321\u2013389 (1993). https:\/\/doi.org\/10.5555\/153687","DOI":"10.5555\/153687"},{"key":"3231_CR48","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1561\/2000000001","volume":"1","author":"LR Rabiner","year":"2007","unstructured":"L.R. Rabiner, R.W. Schafer, Introduction to digital speech processing. Found. Trends\u00ae Signal Process. 1, 1\u2013194 (2007). https:\/\/doi.org\/10.1561\/2000000001","journal-title":"Found. Trends\u00ae Signal Process."},{"key":"3231_CR49","doi-asserted-by":"publisher","unstructured":"N. Reimers, and I. Gurevych, Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint (2019). https:\/\/doi.org\/10.48550\/arXiv.1908.10084","DOI":"10.48550\/arXiv.1908.10084"},{"issue":"1","key":"3231_CR50","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/TASSP.1975.1162629","volume":"23","author":"R Schwartz","year":"1975","unstructured":"R. Schwartz, J. Makhoul, Where the phonemes are: dealing with ambiguity in acoustic-phonetic recognition. IEEE Trans. Acoust. Speech Signal Process. 23(1), 50\u201353 (1975). https:\/\/doi.org\/10.1109\/TASSP.1975.1162629","journal-title":"IEEE Trans. Acoust. 
Speech Signal Process."},{"key":"3231_CR51","doi-asserted-by":"publisher","first-page":"80682","DOI":"10.1109\/ACCESS.2021.3078715","volume":"9","author":"D Seo","year":"2021","unstructured":"D. Seo, Oh. Heung-Seon, Y. Jung, Wav2kws: transfer learning from speech representations for keyword spotting. IEEE Access 9, 80682\u201380691 (2021). https:\/\/doi.org\/10.1109\/ACCESS.2021.3078715","journal-title":"IEEE Access"},{"key":"3231_CR52","doi-asserted-by":"publisher","unstructured":"C. Shain, and M. Elsner, Measuring the perceptual availability of phonological features during language acquisition using unsupervised binary stochastic autoencoders. In proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, Volume 1 (Long and Short Papers), pp. 69\u201385 (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1007","DOI":"10.18653\/v1\/N19-1007"},{"key":"3231_CR53","doi-asserted-by":"publisher","first-page":"53040","DOI":"10.1109\/ACCESS.2019.2912200","volume":"7","author":"A Shrestha","year":"2019","unstructured":"A. Shrestha, A. Mahmood, Review of deep learning algorithms and architectures. IEEE Access 7, 53040\u201353065 (2019)","journal-title":"IEEE Access"},{"key":"3231_CR54","doi-asserted-by":"publisher","first-page":"62","DOI":"10.35699\/2237-5864.2011.2021","volume":"1","author":"TC Silva","year":"2011","unstructured":"T.C. Silva, H.C. Yehia, Sonoridade em Artes, Sa\u00fade e Tecnologia. Revista Doc\u00eancia do Ensino Superior 1, 62\u201374 (2011)","journal-title":"Revista Doc\u00eancia do Ensino Superior"},{"issue":"6","key":"3231_CR55","doi-asserted-by":"publisher","first-page":"1486","DOI":"10.1109\/78.286964","volume":"42","author":"U Theodor","year":"1994","unstructured":"U. Theodor, U. Shaked, C.E. de Souza, A game theory approach to robust discrete-time H\/sub\/spl infin\/\/-estimation. IEEE Trans. Signal Process. 42(6), 1486\u20131495 (1994). 
https:\/\/doi.org\/10.1109\/78.286964","journal-title":"IEEE Trans. Signal Process."},{"key":"3231_CR56","doi-asserted-by":"publisher","unstructured":"Tan, Xu, Tao Qin, Frank Soong, and Tie-Yan Liu. \"A survey on neural speech synthesis.\"\u00a0arXiv preprint \u00a0(2021). https:\/\/doi.org\/10.48550\/arXiv.2106.15561","DOI":"10.48550\/arXiv.2106.15561"},{"key":"3231_CR57","doi-asserted-by":"publisher","unstructured":"V.A. Trinh, H.S. Kavaki, and M.I. Mandel, Importantaug: a data augmentation agent for speech. In ICASSP 2022\u20132022 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 8592\u20138596 (2022). IEEE https:\/\/doi.org\/10.48550\/arXiv.2112.07156","DOI":"10.48550\/arXiv.2112.07156"},{"key":"3231_CR58","doi-asserted-by":"publisher","unstructured":"T. Tu, Y.-J. Chen, C.-c. Yeh, and H.-Y. Lee, End-to-end text-to-speech for low-resource languages by cross-lingual transfer learning. arXiv preprint \u00a0(2019). https:\/\/doi.org\/10.48550\/arXiv.1904.06508","DOI":"10.48550\/arXiv.1904.06508"},{"key":"3231_CR59","doi-asserted-by":"crossref","unstructured":"T. Virtanen, R. Singh, and B. Raj, Techniques for noise robustness in automatic speech recognition. (John Wiley & Sons, 2012)","DOI":"10.1002\/9781118392683"},{"key":"3231_CR60","doi-asserted-by":"publisher","unstructured":"R. Vygon, and N. Mikhaylovskiy, Learning efficient representations for keyword spotting with triplet loss. In speech and computer: 23rd international conference, SPECOM 2021, St. Petersburg, Russia, September 27\u201330, 2021, Proceedings 23, pp. 773\u2013785. Springer International Publishing, 2021. https:\/\/doi.org\/10.1007\/978-3-030-87802-3_69","DOI":"10.1007\/978-3-030-87802-3_69"},{"key":"3231_CR61","doi-asserted-by":"publisher","unstructured":"P. Warden, Speech commands: a dataset for limited-vocabulary speech recognition. (2018) arXiv preprint . 
https:\/\/doi.org\/10.48550\/arXiv.1804.03209","DOI":"10.48550\/arXiv.1804.03209"},{"key":"3231_CR62","doi-asserted-by":"publisher","unstructured":"A.S. Wazir and J.H. Chuah, Spoken Arabic digits recognition using deep learning. In 2019 IEEE international conference on automatic control and intelligent systems (I2CACIS), pp. 339\u2013344 (2019). IEEE https:\/\/doi.org\/10.1109\/I2CACIS.2019.8825004","DOI":"10.1109\/I2CACIS.2019.8825004"},{"key":"3231_CR63","doi-asserted-by":"publisher","unstructured":"Y. Wei, Z. Gong, S. Yang, K. Ye, Y. Wen, EdgeCRNN: an edge-computing oriented model of acoustic feature enhancement for keyword spotting. J. Amb. Intell. Humanized Comput. 1\u201311 (2022). https:\/\/doi.org\/10.1007\/s12652-021-03022-1","DOI":"10.1007\/s12652-021-03022-1"},{"key":"3231_CR64","doi-asserted-by":"publisher","unstructured":"D. Wells, and K. Richmond, Cross-lingual transfer of phonological features for low-resource speech synthesis. In Proc. 11th ISCA Speech Synth. Workshop, pp. 160\u2013165 (2021). https:\/\/doi.org\/10.21437\/SSW.2021-28","DOI":"10.21437\/SSW.2021-28"},{"key":"3231_CR65","doi-asserted-by":"publisher","first-page":"102557","DOI":"10.1016\/j.system.2021.102557","volume":"100","author":"M Yenkimaleki","year":"2021","unstructured":"M. Yenkimaleki, V.J. van Heuven, Effects of attention to segmental vs suprasegmental features on the speech intelligibility and comprehensibility of the EFL learners targeting the perception or production-focused practice. System 100, 102557 (2021). https:\/\/doi.org\/10.1016\/j.system.2021.102557","journal-title":"System"},{"key":"3231_CR66","doi-asserted-by":"publisher","first-page":"10767","DOI":"10.1109\/ACCESS.2019.2891838","volume":"7","author":"M Zeng","year":"2019","unstructured":"M. Zeng, N. Xiao, Effective combination of DenseNet and BiLSTM for keyword spotting. IEEE Access 7, 10767\u201310775 (2019). 
https:\/\/doi.org\/10.1109\/ACCESS.2019.2891838","journal-title":"IEEE Access"},{"issue":"4","key":"3231_CR67","doi-asserted-by":"publisher","first-page":"673","DOI":"10.26599\/TST.2022.9010038","volume":"28","author":"Q Zhang","year":"2023","unstructured":"Q. Zhang, H. Zhang, K. Zhou, L. Zhang, Developing a physiological signal-based, mean threshold and decision-level fusion algorithm (PMD) for Emotion recognition. Tsinghua Sci Technol 28(4), 673\u2013685 (2023). https:\/\/doi.org\/10.26599\/TST.2022.9010038","journal-title":"Tsinghua Sci Technol"},{"key":"3231_CR68","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1016\/j.inffus.2022.09.012","volume":"90","author":"J Zhu","year":"2023","unstructured":"J. Zhu, C. Huang, P. De Meo, DFMKE: a dual fusion multi-modal knowledge graph embedding framework for entity alignment. Inform Fusion 90, 111\u2013119 (2023). https:\/\/doi.org\/10.1016\/j.inffus.2022.09.012","journal-title":"Inform Fusion"},{"key":"3231_CR69","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/s10772-018-09573-7","volume":"22","author":"T Zia","year":"2019","unstructured":"T. Zia, U. Zahid, Long short-term memory recurrent neural network architectures for Urdu acoustic modeling. Int. J. Speech Technol. 22, 21\u201330 (2019). https:\/\/doi.org\/10.1007\/s10772-018-09573-7","journal-title":"Int. J. 
Speech Technol."}],"container-title":["Circuits, Systems, and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-025-03231-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00034-025-03231-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-025-03231-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,30]],"date-time":"2025-11-30T03:28:55Z","timestamp":1764473335000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00034-025-03231-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,28]]},"references-count":69,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["3231"],"URL":"https:\/\/doi.org\/10.1007\/s00034-025-03231-5","relation":{},"ISSN":["0278-081X","1531-5878"],"issn-type":[{"type":"print","value":"0278-081X"},{"type":"electronic","value":"1531-5878"}],"subject":[],"published":{"date-parts":[[2025,7,28]]},"assertion":[{"value":"8 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 June 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 June 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 July 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors state that they have no 
known financial or personal interests that could have impacted the conclusions described in this work.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}