{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T15:29:50Z","timestamp":1757777390471,"version":"3.37.3"},"reference-count":66,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,11,11]],"date-time":"2023-11-11T00:00:00Z","timestamp":1699660800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,11]],"date-time":"2023-11-11T00:00:00Z","timestamp":1699660800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s10772-023-10053-w","type":"journal-article","created":{"date-parts":[[2023,11,11]],"date-time":"2023-11-11T05:01:40Z","timestamp":1699678900000},"page":"903-918","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["End-to-end ASR framework for Indian-English accent: using speech CNN-based segmentation"],"prefix":"10.1007","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1400-1528","authenticated-orcid":false,"given":"Ghayas","family":"Ahmed","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4072-2043","authenticated-orcid":false,"given":"Aadil Ahmad","family":"Lawaye","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,11,11]]},"reference":[{"key":"10053_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/S41870-023-01466-6","volume":"2023","author":"G Ahmed","year":"2023","unstructured":"Ahmed, G., & Lawaye, A. A. (2023). 
CNN-based speech segments endpoints detection framework using short-time signal energy features. International Journal of Information Technology, 2023, 1\u201313. https:\/\/doi.org\/10.1007\/S41870-023-01466-6","journal-title":"International Journal of Information Technology"},{"key":"10053_CR2","unstructured":"Aytar, Y., & Vondrick, C. (n.d.). A. T.-A. in neural, and undefined 2016. \u201cSoundNet: Learning sound representations from unlabeled video. proceedings.neurips.cc. Retrieved May 24, 2023, from https:\/\/proceedings.neurips.cc\/paper\/2016\/hash\/7dcd340d84f762eba80aa538b0c527f7-Abstract.html"},{"key":"10053_CR3","unstructured":"Ba, J. L., Kiros, J. R., & Hinton, G. E. (2016). Layer normalization. Retrieved September 24, 2023, from http:\/\/arxiv.org\/abs\/1607.06450"},{"key":"10053_CR4","unstructured":"Baevski, A., Auli, M., & Mohamed, A. (2019). Effectiveness of self-supervised pre-training for speech recognition. Retrieved September 24, 2023, from http:\/\/arxiv.org\/abs\/1911.03912"},{"key":"10053_CR5","unstructured":"Baevski, A., Schneider, S., & Auli, M. (2020). VQ-WAV2VEC: Self-supervised learning of discrete speech representations. In 8th international conference on learning representations, (ICLR 2020)."},{"key":"10053_CR6","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/978-981-15-0947-6_4","volume":"1076","author":"F Barkani","year":"2020","unstructured":"Barkani, F., Satori, H., Hamidi, M., Zealouk, O., & Laaidi, N. (2020). Comparative evaluation of speech recognition systems based on different toolkits. Advances in Intelligent Systems and Computing, 1076, 33\u201341. https:\/\/doi.org\/10.1007\/978-981-15-0947-6_4","journal-title":"Advances in Intelligent Systems and Computing"},{"key":"10053_CR7","doi-asserted-by":"publisher","unstructured":"Basbug, A. M., & Sert, M. (2019). Analysis of deep neural network models for acoustic scene classification. 
In 27th signal processing and communications applications conference, (SIU 2019). https:\/\/doi.org\/10.1109\/SIU.2019.8806301","DOI":"10.1109\/SIU.2019.8806301"},{"key":"10053_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/J.SPECOM.2018.02.003","volume":"99","author":"H Benisty","year":"2018","unstructured":"Benisty, H., Katz, I., Crammer, K., & Malah, D. (2018). Discriminative keyword spotting for limited-data applications. Speech Communication, 99, 1\u201311. https:\/\/doi.org\/10.1016\/J.SPECOM.2018.02.003","journal-title":"Speech Communication"},{"key":"10053_CR9","doi-asserted-by":"publisher","unstructured":"Chen, L., Zheng, X., Zhang, C., Guo, L., & Yu, B. (2022). Multi-scale temporal-frequency attention for music source separation. In Proceedings\u2014IEEE international conference on multimedia and expo, July 2022. https:\/\/doi.org\/10.1109\/ICME52920.2022.9859957","DOI":"10.1109\/ICME52920.2022.9859957"},{"key":"10053_CR10","doi-asserted-by":"publisher","unstructured":"Cho, J., Yun, S., Park, H., Eum, J., & Hwang, K. (2019). Acoustic scene classification based on a large-margin factorized CNN (pp. 45\u201349). https:\/\/doi.org\/10.33682\/8XH4-JM46","DOI":"10.33682\/8XH4-JM46"},{"key":"10053_CR11","doi-asserted-by":"publisher","unstructured":"Choi, K., Fazekas, G., Sandler, M., & Cho, K. (2017). Convolutional recurrent neural networks for music classification. In ICASSP, IEEE international conference on acoustics, speech and signal processing\u2014Proceedings, June 2017 (pp. 2392\u20132396). https:\/\/doi.org\/10.1109\/ICASSP.2017.7952585.","DOI":"10.1109\/ICASSP.2017.7952585"},{"key":"10053_CR12","unstructured":"Demir, F., Abdullah, D. (n.d.). A. S.-I. Access, and undefined 2020. A new deep CNN model for environmental sound classification. ieeexplore.ieee.org. Retrieved May 24, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/9052658\/"},{"key":"10053_CR13","unstructured":"Devlin, J., Chang, M. 
W., Lee, K., & Toutanova, K. (2019). BERT: Pre-training of deep bidirectional transformers for language understanding. In NAACL HLT 2019\u20142019 conference of the North American chapter of the association for computational linguistics: Human language technologies\u2014Proceedings of the conference, 2019 (Vol. 1, pp. 4171\u20134186)."},{"key":"10053_CR14","doi-asserted-by":"publisher","unstructured":"Dong, M. (2019). Convolutional neural network achieves human-level accuracy in music genre classification. https:\/\/doi.org\/10.32470\/CCN.2018.1153-0","DOI":"10.32470\/CCN.2018.1153-0"},{"key":"10053_CR15","doi-asserted-by":"crossref","unstructured":"D\u00f6rfler, M., & Bammer, R. (n.d.). T. G.-2017 international conference on, and undefined 2017. Inside the spectrogram: Convolutional neural networks in audio processing. ieeexplore.ieee.org. Retrieved May 24, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/8024472\/","DOI":"10.1109\/SAMPTA.2017.8024472"},{"key":"10053_CR16","doi-asserted-by":"publisher","unstructured":"Guzhov, A., Raue, F., Hees, J., & Dengel, A. (2020). EsResNet: Environmental sound classification based on visual domain models. In Proceedings\u2014International conference on pattern recognition, 2020 (pp. 8819\u20138825). https:\/\/doi.org\/10.1109\/ICPR48806.2021.9413035","DOI":"10.1109\/ICPR48806.2021.9413035"},{"key":"10053_CR17","unstructured":"Haflan, V. (2019). Noise robustness in small-vocabulary speech recognition. Retrieved September 25, 2023, from https:\/\/ntnuopen.ntnu.no\/ntnu-xmlui\/handle\/11250\/2613396"},{"key":"10053_CR18","unstructured":"Hatala, Z. (2019). Practical speech recognition with HTK. Retrieved September 25, 2023, from http:\/\/arxiv.org\/abs\/1908.02119"},{"key":"10053_CR19","unstructured":"Hemakumar, G., & P. P.-I. J. of S., and undefined 2014. Automatic segmentation of Kannada speech signal into syllables and sub-words: Noised and noiseless signals. 
Retrieved May 24, 2023, from https:\/\/www.academia.edu\/download\/34681327\/Automatic-Segmentation-of-Kannada-Speech-Signal.pdf"},{"key":"10053_CR20","unstructured":"Hendrycks, D., & Gimpel, K. (2016). Gaussian error linear units (GELUs). Retrieved September 24, 2023, from http:\/\/arxiv.org\/abs\/1606.08415"},{"key":"10053_CR21","unstructured":"Hershey, S., et al. (n.d.). CNN architectures for large-scale audio classification. ieeexplore.ieee.org. Retrieved May 24, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/7952132\/"},{"key":"10053_CR22","doi-asserted-by":"publisher","first-page":"161109","DOI":"10.1109\/ACCESS.2020.3020696","volume":"8","author":"I Hwang","year":"2020","unstructured":"Hwang, I., & Chang, J. H. (2020). End-to-end speech endpoint detection utilizing acoustic and language modeling knowledge for online low-latency speech recognition. IEEE Access, 8, 161109\u2013161123. https:\/\/doi.org\/10.1109\/ACCESS.2020.3020696","journal-title":"IEEE Access"},{"key":"10053_CR23","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1007\/978-3-030-93247-3_23","volume":"371","author":"MM Islam","year":"2022","unstructured":"Islam, M. M., Haque, M., Islam, S., Mia, M. Z. A., & Rahman, S. M. A. M. (2022). DCNN\u2013LSTM based audio classification combining multiple feature engineering and data augmentation techniques. Lecture Notes in Networks and Systems, 371, 227\u2013236. https:\/\/doi.org\/10.1007\/978-3-030-93247-3_23","journal-title":"Lecture Notes in Networks and Systems"},{"key":"10053_CR24","doi-asserted-by":"publisher","unstructured":"Jegou, H., & Douze, M. (2011). C. S.-I. transactions on pattern, and undefined 2010. Product quantization for nearest neighbor search. ieeexplore.ieee.org. In H. Jegou, M. Douze & C. Schmid (Eds), IEEE transactions on pattern analysis and machine intelligence, 2010 (Vol. 33, no. 1, pp. 117\u2013128). 
https:\/\/doi.org\/10.1109\/TPAMI.2010.57","DOI":"10.1109\/TPAMI.2010.57"},{"key":"10053_CR25","doi-asserted-by":"publisher","unstructured":"Jongman, S., Khoe, Y. (n.d.). & Hintz, F. (2021). Vocabulary size influences spontaneous speech in native language users: Validating the use of automatic speech recognition in individual differences research. Language and Speech, 64(1), 35\u201351. https:\/\/doi.org\/10.1177\/0023830920911079","DOI":"10.1177\/0023830920911079"},{"key":"10053_CR26","doi-asserted-by":"publisher","unstructured":"Ketkar, N., & Moolayil, J. (2021). Convolutional neural networks. In Deep learning with Python (pp. 197\u2013242). https:\/\/doi.org\/10.1007\/978-1-4842-5364-9_6","DOI":"10.1007\/978-1-4842-5364-9_6"},{"key":"10053_CR27","doi-asserted-by":"publisher","unstructured":"Kudin, O., Kryvokhata, A., & Gorbenko, V. I. (2020). Developing a deep learning sound classification system for a smart farming. In ECS meeting abstracts (Vol. MA2020-01(26), pp. 1853\u20131853). https:\/\/doi.org\/10.1149\/MA2020-01261853MTGABS","DOI":"10.1149\/MA2020-01261853MTGABS"},{"key":"10053_CR28","unstructured":"Lee, J., Park, J., Kim, K. L., & Nam, J. (2019). Sample-level deep convolutional neural networks for music auto-tagging using raw waveforms. In Proceedings of the 14th sound and music computing conference 2017, (SMC 2017) (pp. 220\u2013226)."},{"key":"10053_CR29","doi-asserted-by":"publisher","unstructured":"Li, X., Chebiyyam, V., & Kirchhoff, K. (2019). Multi-stream network with temporal attention for environmental sound classification. In Proceedings of the annual conference of the international speech communication association, (Interspeech 2019) (pp. 3604\u20133608). 
https:\/\/doi.org\/10.21437\/Interspeech.2019-3019","DOI":"10.21437\/Interspeech.2019-3019"},{"key":"10053_CR30","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1016\/J.NEUCOM.2020.08.092","volume":"445","author":"Y Lin","year":"2021","unstructured":"Lin, Y., Li, Q., Yang, B., Yan, Z., Tan, H., & Chen, Z. (2021). Improving speech recognition models with small samples for air traffic control systems. Neurocomputing, 445, 287\u2013297. https:\/\/doi.org\/10.1016\/J.NEUCOM.2020.08.092","journal-title":"Neurocomputing"},{"key":"10053_CR33","doi-asserted-by":"crossref","unstructured":"Liu, B., Hoffmeister, B., & Rastrow, A. (2015). Accurate endpointing with expected pause duration.","DOI":"10.21437\/Interspeech.2015-449"},{"key":"10053_CR32","unstructured":"Liu, Y., et al. (2019). RoBERTa: A robustly optimized BERT pretraining approach. Retrieved September 24, 2023, from http:\/\/arxiv.org\/abs\/1907.11692"},{"key":"10053_CR31","doi-asserted-by":"crossref","unstructured":"Liu, Y., Iyer, R., Kirchhoff, K., & Bilmes, J. (n.d.). SVitchboard II and FiSVer I: High-quality limited-complexity corpora of conversational English speech. people.ece.uw.edu. Retrieved September 25, 2023, from https:\/\/people.ece.uw.edu\/bilmes\/p\/mypubs\/liu-svb-ii-interspeech-2015.pdf","DOI":"10.21437\/Interspeech.2015-238"},{"key":"10053_CR35","unstructured":"Maas, R., et al. (n.d.). Combining acoustic embeddings and decoding features for end-of-utterance detection in real-time far-field speech recognition systems. ieeexplore.ieee.org. Retrieved May 27, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/8461478\/"},{"key":"10053_CR34","doi-asserted-by":"crossref","unstructured":"Maas, R., Rastrow, A., Goehner, K., Tiwari, G., & Joseph, S. (2017). Domain-specific utterance end-point detection for speech recognition. 
Retrieved May 27, 2023, from https:\/\/www.amazon.science\/publications\/domain-specific-utterance-end-point-detection-for-speech-recognition","DOI":"10.21437\/Interspeech.2017-1673"},{"issue":"1","key":"10053_CR36","doi-asserted-by":"publisher","first-page":"295","DOI":"10.1016\/J.CSL.2013.07.003","volume":"28","author":"MW Mak","year":"2014","unstructured":"Mak, M. W., & Yu, H. B. (2014). A study of voice activity detection techniques for NIST speaker recognition evaluations. Computer Speech and Language, 28(1), 295\u2013313. https:\/\/doi.org\/10.1016\/J.CSL.2013.07.003","journal-title":"Computer Speech and Language"},{"issue":"1","key":"10053_CR37","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1007\/S10772-020-09673-3","volume":"23","author":"P Mittal","year":"2020","unstructured":"Mittal, P., & Singh, N. (2020). Subword analysis of small vocabulary and large vocabulary ASR for Punjabi language. International Journal of Speech Technology, 23(1), 71\u201378. https:\/\/doi.org\/10.1007\/S10772-020-09673-3","journal-title":"International Journal of Speech Technology"},{"key":"10053_CR38","unstructured":"Mohamed, A., Okhonko, D., & Zettlemoyer, L. (2019). Transformers with convolutional context for ASR. Retrieved September 24, 2023, from http:\/\/arxiv.org\/abs\/1904.11660"},{"key":"10053_CR39","doi-asserted-by":"publisher","unstructured":"Moreno, I. L., Wan, L., Wang, Q., Ding, S., & Chang, S. (2019). Personal VAD: Speaker-conditioned voice activity detection (pp. 433\u2013439). https:\/\/doi.org\/10.21437\/odyssey.2020-62","DOI":"10.21437\/odyssey.2020-62"},{"issue":"6","key":"10053_CR40","doi-asserted-by":"publisher","first-page":"1261","DOI":"10.1109\/TASL.2013.2248717","volume":"21","author":"S Mousazadeh","year":"2013","unstructured":"Mousazadeh, S., & Cohen, I. (2013). Voice activity detection in presence of transient noise using spectral clustering. IEEE Transactions on Audio, Speech, and Language Processing, 21(6), 1261\u20131271. 
https:\/\/doi.org\/10.1109\/TASL.2013.2248717","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"10053_CR41","doi-asserted-by":"publisher","unstructured":"Nguyen, T. (n.d.). F. P.-C. of the I. E. in, and undefined 2020. Lung sound classification using snapshot ensemble of convolutional neural networks. ieeexplore.ieee.org July 2020 (pp. 760\u2013763). https:\/\/doi.org\/10.1109\/EMBC44109.2020.9176076","DOI":"10.1109\/EMBC44109.2020.9176076"},{"key":"10053_CR42","unstructured":"Niranjan, K. (n.d.). S. V.-2021 F. International, and undefined 2021. Ensemble and multi model approach to environmental sound classification. ieeexplore.ieee.org. Retrieved May 24, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/9616775\/"},{"key":"10053_CR43","unstructured":"Noughreche, A., Boulouma, S., Benbaghdad, M., Adnene, N., Sabri, B., & Mohammed, B. (2021). Design and implementation of an automatic speech recognition based voice control system. easychair.org. In N. Adnene, B. Sabri & B. Mohammed (Eds), Conference on electrical engineering 2021. Retrieved September 25, 2023, from https:\/\/easychair.org\/publications\/preprint_download\/wzRf"},{"key":"10053_CR44","unstructured":"Ouyang, Z., Yu, H., Zhu, W.-P., & Champagne, B. (n.d.). A fully convolutional neural network for complex spectrogram processing in speech enhancement. ieeexplore.ieee.org. Retrieved May 24, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/8683423\/"},{"key":"10053_CR45","doi-asserted-by":"publisher","unstructured":"Park, D. S., et al. (2019). Specaugment: A simple data augmentation method for automatic speech recognition. In Proceedings of the annual conference of the international speech communication association, (Interspeech 2019) (pp. 2613\u20132617). 
https:\/\/doi.org\/10.21437\/Interspeech.2019-2680","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"10053_CR46","doi-asserted-by":"publisher","unstructured":"Prombut, N., Waijanya, S., Promrit, N. (2021). Feature extraction technique based on Conv1D and Conv2D network for Thai speech emotion recognition. In ACM international conference proceeding series, December 2021 (pp. 54\u201360). https:\/\/doi.org\/10.1145\/3508230.3508238.","DOI":"10.1145\/3508230.3508238"},{"issue":"2","key":"10053_CR47","doi-asserted-by":"publisher","first-page":"297","DOI":"10.1002\/J.1538-7305.1975.TB02840.X","volume":"54","author":"LR Rabiner","year":"1975","unstructured":"Rabiner, L. R., & Sambur, M. R. (1975). An algorithm for determining the endpoints of isolated utterances. Bell System Technical Journal, 54(2), 297\u2013315. https:\/\/doi.org\/10.1002\/J.1538-7305.1975.TB02840.X","journal-title":"Bell System Technical Journal"},{"key":"10053_CR49","unstructured":"Rahman, M. (n.d.). M. B.-I. J. of Advanced, and undefined 2012. Continuous Bangla speech segmentation using short-term speech features extraction approaches. academia.edu. Retrieved May 24, 2023, from https:\/\/www.academia.edu\/download\/59515251\/Cashless_Society_pg_197-20320190604-52015-1ydu74l.pdf#page=143"},{"key":"10053_CR48","unstructured":"Rahman, M., Khatun, F. (n.d.). M. B.-E. Preface, and undefined 2015. Blocking black area method for speech segmentation. Citeseer. Retrieved May 24, 2023, from https:\/\/citeseerx.ist.psu.edu\/document?repid=rep1&type=pdf&doi=fbe239a538e7e07f72a5f95535e671a6baa3d1c9#page=9"},{"key":"10053_CR50","unstructured":"Roberts, A., Engel, J., Raffel, C., Hawthorne, C., & Eck, D. (2018). A hierarchical latent vector model for learning long-term structure in music. In Proceedings. MLR Press. Retrieved May 24, 2023, from http:\/\/proceedings.mlr.press\/v80\/roberts18a.html"},{"key":"10053_CR51","unstructured":"Scheirer, E. (n.d.). M. S.-1997 I. 
international conference on, and undefined 1997. Construction and evaluation of a robust multifeature speech\/music discriminator. ieeexplore.ieee.org. Retrieved May 24, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/596192\/"},{"key":"10053_CR52","unstructured":"Si, S., et al. (n.d.). Variational information bottleneck for effective low-resource audio classification. arxiv.org. Retrieved May 24, 2023, from https:\/\/arxiv.org\/abs\/2107.04803"},{"key":"10053_CR53","unstructured":"Snyder, D., Chen, G., & Povey, D. (n.d.). MUSAN: A music, speech, and noise corpus. Retrieved May 24, 2023, from http:\/\/arxiv.org\/abs\/1510.08484"},{"key":"10053_CR54","doi-asserted-by":"publisher","unstructured":"Su, Y., Zhang, K., Wang, J. (n.d.). K. M.- Sensors, and undefined 2019. Environment sound classification using a two-stream CNN based on decision-level fusion. mdpi.com (Vol. 19, no. 7). https:\/\/doi.org\/10.3390\/s19071733","DOI":"10.3390\/s19071733"},{"key":"10053_CR55","doi-asserted-by":"crossref","unstructured":"Supriya, S. (n.d.). S. H.-2017 I. I. Conference, and undefined 2017. Speech recognition using HTK toolkit for Marathi language. ieeexplore.ieee.org. In S. Supriya & Handore, S. M. (Eds), 2017 IEEE international conference on power, control, signals. Retrieved September 25, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/8391979\/","DOI":"10.1109\/ICPCSI.2017.8391979"},{"key":"10053_CR56","unstructured":"T. G. I. and Telecommunications, vol. PhD, and undefined 2009. Study and application of acoustic information for the detection of harmful content, and fusion with visual information. cgi.di.uoa.gr 2009. 
Retrieved May 24, 2023, from http:\/\/cgi.di.uoa.gr\/~tyiannak\/phdText.pdf"},{"issue":"15","key":"10053_CR57","doi-asserted-by":"publisher","first-page":"6695","DOI":"10.3390\/APP11156695","volume":"11","author":"C Tejedor-Garc\u00eda","year":"2021","unstructured":"Tejedor-Garc\u00eda, C., Carde\u00f1oso-Payo, V., & Escudero-Mancebo, D. (2021). Automatic speech recognition (ASR) systems applied to pronunciation assessment of L2 Spanish for Japanese speakers. Applied Sciences, 11(15), 6695. https:\/\/doi.org\/10.3390\/APP11156695","journal-title":"Applied Sciences"},{"key":"10053_CR58","unstructured":"Theera-Umpon, N., et al. (n.d.). Thai phoneme segmentation using dual-band energy contour. researchgate.net. Retrieved May 24, 2023, from https:\/\/www.researchgate.net\/profile\/Nipon-Theera-Umpon\/publication\/266067316_Thai_Phoneme_Segmentation_using_Dual-Band_Energy_Contour\/links\/569dcae708ae950bd7a6b277\/Thai-Phoneme-Segmentation-using-Dual-Band-Energy-Contour.pdf"},{"key":"10053_CR59","doi-asserted-by":"crossref","unstructured":"Tokozume, Y. (n.d.). T. H.-2017 I. international conference on, and undefined 2017. Learning environmental sounds with end-to-end convolutional neural network. ieeexplore.ieee.org. Retrieved May 24, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/7952651\/","DOI":"10.1109\/ICASSP.2017.7952651"},{"key":"10053_CR60","doi-asserted-by":"publisher","unstructured":"Tzanetakis, G. (n.d.). P. C.-I. T. on speech and, and undefined 2002. Musical genre classification of audio signals. ieeexplore.ieee.org (Vol. 10, no. 5, p. 293). https:\/\/doi.org\/10.1109\/TSA.2002.800560","DOI":"10.1109\/TSA.2002.800560"},{"key":"10053_CR61","unstructured":"Van Den Oord, A., et al. (n.d.). WaveNet: A generative model for raw audio. arxiv.org. Retrieved May 24, 2023, from https:\/\/arxiv.org\/abs\/1609.03499"},{"key":"10053_CR62","unstructured":"Vaswani, A., et al. (n.d.). Attention is all you need. proceedings.neurips.cc. 
Retrieved September 24, 2023, from https:\/\/proceedings.neurips.cc\/paper\/7181-attention-is-all"},{"key":"10053_CR63","unstructured":"Vidhya, J. (n.d.). R. U.-P. of the Algorithms, C. and, and undefined 2021. Violence detection in videos using Conv2D VGG-19 architecture and LSTM network. ceur-ws.org, 2021. Retrieved May 24, 2023, from http:\/\/ceur-ws.org\/Vol-3010\/PAPER_09.pdf"},{"key":"10053_CR64","unstructured":"Wu, F., Fan, A., Baevski, A., Dauphin, Y. N., & Auli, M. (2019). Pay less attention with lightweight and dynamic convolutions. In 7th international conference on learning representations, (ICLR 2019)."},{"key":"10053_CR65","unstructured":"Zhang, T. (n.d.). C. K.-1999 I. I. C. on, and undefined 1999. Hierarchical classification of audio data for archiving and retrieving. ieeexplore.ieee.org. Retrieved May 24, 2023, from https:\/\/ieeexplore.ieee.org\/abstract\/document\/757472\/"},{"key":"10053_CR66","doi-asserted-by":"crossref","unstructured":"Zhang, W., Lei, W., Xu, X. (n.d.). X. X.- Interspeech, and undefined 2016. Improved music genre classification with convolutional neural networks. isca-speech.org. 
Retrieved May 24, 2023, from https:\/\/www.isca-speech.org\/archive_v0\/Interspeech_2016\/pdfs\/1236.PDF","DOI":"10.21437\/Interspeech.2016-1236"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10053-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-023-10053-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10053-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,11]],"date-time":"2024-01-11T10:13:21Z","timestamp":1704968001000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-023-10053-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,11]]},"references-count":66,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["10053"],"URL":"https:\/\/doi.org\/10.1007\/s10772-023-10053-w","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2023,11,11]]},"assertion":[{"value":"2 June 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 September 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 November 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"There are no apparent 
conflicts of interest for the authors to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}