{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T12:23:56Z","timestamp":1764937436719,"version":"3.37.3"},"reference-count":71,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,7,20]],"date-time":"2022-07-20T00:00:00Z","timestamp":1658275200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,7,20]],"date-time":"2022-07-20T00:00:00Z","timestamp":1658275200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2023,1]]},"DOI":"10.1007\/s11042-022-13435-5","type":"journal-article","created":{"date-parts":[[2022,7,20]],"date-time":"2022-07-20T02:02:16Z","timestamp":1658282536000},"page":"3973-3994","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Prosody features based low resource Punjabi children ASR and T-NT classifier using data augmentation"],"prefix":"10.1007","volume":"82","author":[{"given":"Virender","family":"Kadyan","sequence":"first","affiliation":[]},{"given":"Taniya","family":"Hasija","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5884-3145","authenticated-orcid":false,"given":"Amitoj","family":"Singh","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,7,20]]},"reference":[{"issue":"2","key":"13435_CR1","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1007\/s10772-010-9088-7","volume":"14","author":"MA Anusuya","year":"2011","unstructured":"Anusuya MA, Katti SK (2011) Front end analysis of speech recognition: a review. Int J Speech Technol 14(2):99\u2013145. https:\/\/doi.org\/10.1007\/s10772-010-9088-7","journal-title":"Int J Speech Technol"},{"key":"13435_CR2","unstructured":"Balam J, Huang J, Lavrukhin V, Deng S, Majumdar S, Ginsburg B (2020) Improving noise robustness of an end-to-end neural model for automatic speech recognition. https:\/\/arxiv.org\/abs\/2010.12715"},{"key":"13435_CR3","doi-asserted-by":"publisher","first-page":"107810","DOI":"10.1016\/j.apacoust.2020.107810","volume":"175","author":"P Bawa","year":"2021","unstructured":"Bawa P, Kadyan V (2021) Noise robust in-domain children speech enhancement for automatic Punjabi recognition system under mismatched conditions. Appl Acoust 175:107810","journal-title":"Appl Acoust"},{"key":"13435_CR4","doi-asserted-by":"publisher","unstructured":"Benzeghiba M, De Mori R, Deroo O et al (2007) Automatic speech recognition and speech variability: a review. Speech Comm 49(10\u201311):763\u2013786. https:\/\/doi.org\/10.1016\/j.specom.2007.02.006","DOI":"10.1016\/j.specom.2007.02.006"},{"key":"13435_CR5","doi-asserted-by":"crossref","unstructured":"Billa J (2018). ISI ASR system for the low resource speech recognition challenge for Indian languages. In INTERSPEECH 3207\u20133211","DOI":"10.21437\/Interspeech.2018-2473"},{"key":"13435_CR6","doi-asserted-by":"publisher","unstructured":"Du C, Yu K (2020) Speaker augmentation for low resource speech recognition. In ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE 7719\u20137723. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053139","DOI":"10.1109\/ICASSP40776.2020.9053139"},{"issue":"3","key":"13435_CR7","doi-asserted-by":"publisher","first-page":"389","DOI":"10.1016\/j.jestch.2018.04.005","volume":"21","author":"M Dua","year":"2018","unstructured":"Dua M, Aggarwal RK, Biswas M (2018) Performance evaluation of Hindi speech recognition system using optimized filterbanks. Engineering Science and Technology 21(3):389\u2013398. https:\/\/doi.org\/10.1016\/j.jestch.2018.04.005","journal-title":"Engineering Science and Technology"},{"issue":"10","key":"13435_CR8","doi-asserted-by":"publisher","first-page":"6747","DOI":"10.1007\/s00521-018-3499-9","volume":"31","author":"M Dua","year":"2019","unstructured":"Dua M, Aggarwal RK, Biswas M (2019a) Discriminatively trained continuous Hindi speech recognition system using interpolated recurrent neural network language modeling. Neural Comput & Applic 31(10):6747\u20136755","journal-title":"Neural Comput & Applic"},{"issue":"6","key":"13435_CR9","doi-asserted-by":"publisher","first-page":"2301","DOI":"10.1007\/s12652-018-0828-x","volume":"10","author":"M Dua","year":"2019","unstructured":"Dua M, Aggarwal RK, Biswas M (2019b) GFCC based discriminatively trained noise robust continuous ASR system for Hindi language. J Ambient Intell Humaniz Comput 10(6):2301\u20132314. https:\/\/doi.org\/10.1007\/s12652-018-0828-x","journal-title":"J Ambient Intell Humaniz Comput"},{"issue":"4","key":"13435_CR10","first-page":"359","volume":"9","author":"M Dua","year":"2012","unstructured":"Dua M, Aggarwal RK, Kadyan V, Dua S (2012) Punjabi automatic speech recognition using HTK. Int J Comput Sci Issues (IJCSI) 9(4):359","journal-title":"Int J Comput Sci Issues (IJCSI)"},{"key":"13435_CR11","volume-title":"Why is speech recognition difficult","author":"M Forsberg","year":"2003","unstructured":"Forsberg M (2003) Why is speech recognition difficult. Chalmers University of Technology"},{"key":"13435_CR12","doi-asserted-by":"publisher","unstructured":"Geng M, Xie X, Liu S, Yu J, Hu S, Liu X, Meng H (2020) Investigation of data augmentation techniques for disordered speech recognition. Proc. Interspeech 696\u2013700. https:\/\/doi.org\/10.21437\/Interspeech.2020-1161","DOI":"10.21437\/Interspeech.2020-1161"},{"issue":"10\u201311","key":"13435_CR13","doi-asserted-by":"publisher","first-page":"847","DOI":"10.1016\/j.specom.2007.01.002","volume":"49","author":"M Gerosa","year":"2007","unstructured":"Gerosa M, Giuliani D, Brugnara F (2007) Acoustic variability and automatic recognition of children\u2019s speech. Speech Comm 49(10\u201311):847\u2013860. https:\/\/doi.org\/10.1016\/j.specom.2007.01.002","journal-title":"Speech Comm"},{"key":"13435_CR14","doi-asserted-by":"publisher","unstructured":"Ghahremani P, BabaAli B, Povey D, Riedhammer K, Trmal J, Khudanpur S (2014) A pitch extraction algorithm tuned for automatic speech recognition. In 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE 2494\u20132498. https:\/\/doi.org\/10.1109\/ICASSP.2014.6854049","DOI":"10.1109\/ICASSP.2014.6854049"},{"key":"13435_CR15","doi-asserted-by":"publisher","unstructured":"Goyal K, Singh A, Kadyan V (2021) A comparison of laryngeal effect in the dialects of Punjabi language. J Ambient Intell Human Comput. https:\/\/doi.org\/10.1007\/s12652-021-03235-4","DOI":"10.1007\/s12652-021-03235-4"},{"key":"13435_CR16","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1016\/j.future.2020.11.022","volume":"117","author":"S Hakak","year":"2021","unstructured":"Hakak S, Alazab M, Khan S, Gadekallu TR, Maddikunta PKR, Khan WZ (2021) An ensemble machine learning approach through effective feature extraction to classify fake news. Futur Gener Comput Syst 117:47\u201358","journal-title":"Futur Gener Comput Syst"},{"key":"13435_CR17","unstructured":"Jaitly N, Hinton GE (2013, June) Vocal tract length perturbation (VTLP) improves speech recognition. In Proc. ICML workshop on deep learning for audio, speech and language (Vol. 117)."},{"key":"13435_CR18","doi-asserted-by":"publisher","first-page":"761","DOI":"10.1007\/s10772-017-9446-9","volume":"20","author":"V Kadyan","year":"2017","unstructured":"Kadyan V, Mantri A, Aggarwal RK (2017) A heterogeneous speech feature vectors generation approach with hybrid hmm classifiers. Int J Speech Technol 20:761\u2013769. https:\/\/doi.org\/10.1007\/s10772-017-9446-9","journal-title":"Int J Speech Technol"},{"issue":"1","key":"13435_CR19","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1007\/s10772-018-09577-3","volume":"22","author":"V Kadyan","year":"2019","unstructured":"Kadyan V, Mantri A, Aggarwal RK, Singh A (2019) A comparative study of deep neural network based Punjabi-ASR system. Int J Speech Technol 22(1):111\u2013119. https:\/\/doi.org\/10.1007\/s10772-018-09577-3","journal-title":"Int J Speech Technol"},{"key":"13435_CR20","unstructured":"Kadyan V (2018) Acoustic features optimization for Punjabi automatic speech recognition system. PhD diss. Chitkara University"},{"key":"13435_CR21","doi-asserted-by":"publisher","unstructured":"Kathania HK, Kadiri SR, Alku P, Kurimo M (2020) Study of formant modification for children ASR. In ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE 7429\u20137433. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053334","DOI":"10.1109\/ICASSP40776.2020.9053334"},{"key":"13435_CR22","doi-asserted-by":"publisher","unstructured":"Kathania HK, Shahnawazuddin S, Adiga N, Ahmad W (2018) Role of prosodic features on children's speech recognition. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE 5519\u20135523. https:\/\/doi.org\/10.1109\/ICASSP.2018.8461668","DOI":"10.1109\/ICASSP.2018.8461668"},{"key":"13435_CR23","doi-asserted-by":"crossref","unstructured":"Kaur A, Singh A (2016a) Power-normalized cepstral coefficients (PNCC) for Punjabi automatic speech recognition using phone based modelling in HTK, second international conference on applied and theoretical computing and communication technology. IEEE Explore, ICATCCT2016, Bengaluru.","DOI":"10.1109\/ICATCCT.2016.7912026"},{"key":"13435_CR24","doi-asserted-by":"crossref","unstructured":"Kaur A, Singh A (2016b) Optimizing feature extraction techniques constituting phone based modelling on connected words for Punjabi automatic speech recognition, communicated in 5th International Conference on Advances in Computing, Communications and Informatics, IEEE Explore, ICACCI-2016, Jaipur","DOI":"10.1109\/ICACCI.2016.7732362"},{"key":"13435_CR25","doi-asserted-by":"crossref","unstructured":"Kaur H, Kadyan V. (2020) Feature space discriminatively trained Punjabi children speech recognition system using Kaldi toolkit. Available at SSRN 3565906.","DOI":"10.2139\/ssrn.3565906"},{"key":"13435_CR26","doi-asserted-by":"publisher","unstructured":"Kaur J, Singh A, Kadyan V (2020) Automatic speech recognition system for tonal languages: state-of-the-art survey. Archives of Computational Methods in Engineering:1\u201330. https:\/\/doi.org\/10.1007\/s11831-020-09414-4","DOI":"10.1007\/s11831-020-09414-4"},{"key":"13435_CR27","doi-asserted-by":"crossref","unstructured":"Ko T, Peddinti V, Povey D, Khudanpur S (2015) Audio augmentation for speech recognition. In Sixteenth Annual Conference of the International Speech Communication Association.","DOI":"10.21437\/Interspeech.2015-711"},{"key":"13435_CR28","doi-asserted-by":"publisher","unstructured":"Ko T, Peddinti V, Povey D et al (2017) A study on data augmentation of reverberant speech for robust speech recognition. In 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 5220\u20135224. https:\/\/doi.org\/10.1109\/ICASSP.2017.7953152","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"13435_CR29","doi-asserted-by":"publisher","first-page":"1617","DOI":"10.1007\/s00500-020-05248-1","volume":"25","author":"Y Kumar","year":"2021","unstructured":"Kumar Y, Singh N, Kumar M, Singh A (2021) AutoSSR: an efficient approach for automatic spontaneous speech recognition model for the Punjabi language. Soft Comput 25:1617\u20131630. https:\/\/doi.org\/10.1007\/s00500-020-05248-1","journal-title":"Soft Comput"},{"key":"13435_CR30","doi-asserted-by":"publisher","unstructured":"Kwon O, Jang I, Ahn C, Kang HG (2019) Emotional speech synthesis based on style embedded Tacotron2 framework. In 2019 34th International Technical Conference on Circuits\/Systems, Computers and Communications (ITC-CSCC). IEEE, 1\u20134. https:\/\/doi.org\/10.1109\/ITC-CSCC.2019.8793393","DOI":"10.1109\/ITC-CSCC.2019.8793393"},{"key":"13435_CR31","unstructured":"Lata S, Arora S (2012, May) Exploratory analysis of Punjabi tones in relation to orthographic characters: a case study. In Workshop on Indian Language and Data: Resources and Evaluation Workshop programme 76"},{"key":"13435_CR32","doi-asserted-by":"publisher","unstructured":"Lata S, Arora S (2013, August) Laryngeal tonal characteristics of Punjabi\u2014an experimental study. In 2013 International Conference on Human Computer Interactions (ICHCI). IEEE, 1\u20136 https:\/\/doi.org\/10.1109\/ICHCI-IEEE.2013.6887793","DOI":"10.1109\/ICHCI-IEEE.2013.6887793"},{"issue":"3","key":"13435_CR33","doi-asserted-by":"publisher","first-page":"1455","DOI":"10.1121\/1.426686","volume":"105","author":"S Lee","year":"1999","unstructured":"Lee S, Potamianos A, Narayanan S (1999) Acoustics of children\u2019s speech: developmental changes of temporal and spectral parameters. The Journal of the Acoustical Society of America 105(3):1455\u20131468. https:\/\/doi.org\/10.1121\/1.426686","journal-title":"The Journal of the Acoustical Society of America"},{"key":"13435_CR34","doi-asserted-by":"crossref","unstructured":"Lei X, Siu M, Hwang MY et al (2006) Improved tone modeling for mandarin broadcast news speech recognition. In Ninth International Conference on Spoken Language Processing","DOI":"10.21437\/Interspeech.2006-372"},{"key":"13435_CR35","doi-asserted-by":"publisher","unstructured":"Li C, Qian Y (2019) Prosody usage optimization for children speech recognition with zero resource children speech. In Interspeech 3446\u20133450. https:\/\/doi.org\/10.21437\/Interspeech.2019-2659","DOI":"10.21437\/Interspeech.2019-2659"},{"key":"13435_CR36","doi-asserted-by":"crossref","unstructured":"Li X, Wu X (2015) Modeling speaker variability using long short-term memory networks for speech recognition. In Sixteenth Annual Conference of the International Speech Communication Association.","DOI":"10.21437\/Interspeech.2015-287"},{"key":"13435_CR37","unstructured":"Litman DJ, Hirschberg JB, Swerts M (2000) Predicting automatic speech recognition performance using prosodic cues, Proc. 1st North Am. Chapter Assoc. Comput. Linguist. Conf. 218\u2013225 [Online]. Available: http:\/\/dl.acm.org\/citation.cfm?id=974305.974334."},{"key":"13435_CR38","doi-asserted-by":"publisher","first-page":"107175","DOI":"10.1016\/j.apacoust.2019.107175","volume":"161","author":"Y Long","year":"2020","unstructured":"Long Y, Li Y, Zhang Q, Wei S, Ye H, Yang J (2020) Acoustic data augmentation for mandarin-English code-switching speech recognition. Appl Acoust 161:107175. https:\/\/doi.org\/10.1016\/j.apacoust.2019.107175","journal-title":"Appl Acoust"},{"issue":"10","key":"13435_CR39","doi-asserted-by":"publisher","first-page":"782","DOI":"10.1016\/j.specom.2008.04.010","volume":"50","author":"L Mary","year":"2008","unstructured":"Mary L, Yegnanarayana B (2008) Extraction and representation of prosodic features for language and speaker recognition. Speech Comm 50(10):782\u2013796. https:\/\/doi.org\/10.1016\/j.specom.2008.04.010","journal-title":"Speech Comm"},{"key":"13435_CR40","unstructured":"Milde B, K\u00f6hn A (2018) Open source automatic speech recognition for German. In Speech Communication; 13th ITG-Symposium 1\u20135 VDE"},{"key":"13435_CR41","doi-asserted-by":"publisher","unstructured":"Nguyen TS, Stueker S, Niehues J, et al (2020) Improving sequence-to-sequence speech recognition training with on-the-fly data augmentation. In ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE. 7689\u20137693 https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054130","DOI":"10.1109\/ICASSP40776.2020.9054130"},{"issue":"2","key":"13435_CR42","doi-asserted-by":"publisher","first-page":"675","DOI":"10.1007\/s12652-019-01325-y","volume":"11","author":"V Passricha","year":"2020","unstructured":"Passricha V, Aggarwal RK (2020) A comparative analysis of pooling strategies for convolutional neural network based Hindi ASR. J Ambient Intell Humaniz Comput 11(2):675\u2013691. https:\/\/doi.org\/10.1007\/s12652-019-01325-y","journal-title":"J Ambient Intell Humaniz Comput"},{"key":"13435_CR43","unstructured":"Povey D, Ghoshal A, Boulianne G et al(2011) The Kaldi speech recognition toolkit. In IEEE 2011 workshop on automatic speech recognition and understanding (No. CONF). IEEE Signal Processing Society"},{"issue":"8","key":"13435_CR44","first-page":"56","volume":"10","author":"MS Rafi","year":"2010","unstructured":"Rafi MS (2010) Semantic variations of Punjabi Toneme. Lang India 10(8):56\u201365 http:\/\/hdl.handle.net\/123456789\/543","journal-title":"Lang India"},{"key":"13435_CR45","doi-asserted-by":"publisher","unstructured":"Ravinder K (2010) Comparison of hmm and dtw for isolated word recognition system of Punjabi language. In Iberoamerican Congress on Pattern Recognition. Springer, Heidelberg. 244\u2013252 https:\/\/doi.org\/10.1007\/978-3-642-16687-7_35","DOI":"10.1007\/978-3-642-16687-7_35"},{"key":"13435_CR46","doi-asserted-by":"publisher","unstructured":"Rose R, Yin SC, Tang Y (2011) An investigation of subspace modeling for phonetic and speaker variability in automatic speech recognition. In 2011 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE. 4508\u20134511. https:\/\/doi.org\/10.1109\/ICASSP.2011.5947356","DOI":"10.1109\/ICASSP.2011.5947356"},{"key":"13435_CR47","doi-asserted-by":"crossref","unstructured":"Rostami M, Berahmand K, Forouzandeh S (2020) A novel method of constrained feature selection by the measurement of pairwise constraints uncertainty. J Big Data 7(1):1\u201321","DOI":"10.1186\/s40537-020-00352-3"},{"issue":"1","key":"13435_CR48","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-020-00398-3","volume":"8","author":"M Rostami","year":"2021","unstructured":"Rostami M, Berahmand K, Forouzandeh S (2021) A novel community detection based genetic algorithm for feature selection. J Big Data 8(1):1\u201327","journal-title":"J Big Data"},{"issue":"11","key":"13435_CR49","doi-asserted-by":"publisher","first-page":"1749","DOI":"10.1109\/LSP.2017.2756347","volume":"24","author":"S Shahnawazuddin","year":"2017","unstructured":"Shahnawazuddin S, Adiga N, Kathania HK (2017) Effect of prosody modification on children's ASR. IEEE Signal Processing Letters 24(11):1749\u20131753. https:\/\/doi.org\/10.1109\/LSP.2017.2756347","journal-title":"IEEE Signal Processing Letters"},{"key":"13435_CR50","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1016\/j.patrec.2019.12.019","volume":"131","author":"S Shahnawazuddin","year":"2020","unstructured":"Shahnawazuddin S, Adiga N, Kathania HK, Sai BT (2020a) Creating speaker independent ASR system through prosody modification based data augmentation. Pattern Recogn Lett 131:213\u2013218. https:\/\/doi.org\/10.1016\/j.patrec.2019.12.019","journal-title":"Pattern Recogn Lett"},{"key":"13435_CR51","doi-asserted-by":"publisher","unstructured":"Shahnawazuddin S, Adiga N, Kumar K et al (2020b). Voice conversion based data augmentation to improve Children\u2019s speech recognition in limited data scenario. Proc. Interspeech 2020, 4382\u20134386. https:\/\/doi.org\/10.21437\/Interspeech.2020-1112","DOI":"10.21437\/Interspeech.2020-1112"},{"key":"13435_CR52","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1016\/j.dsp.2019.06.015","volume":"93","author":"S Shahnawazuddin","year":"2019","unstructured":"Shahnawazuddin S, Adiga N, Sai BT, Ahmad W, Kathania HK (2019) Developing speaker independent ASR system using limited data through prosody modification based on fuzzy classification of spectral bins. Digital Signal Processing 93:34\u201342. https:\/\/doi.org\/10.1016\/j.dsp.2019.06.015","journal-title":"Digital Signal Processing"},{"key":"13435_CR53","doi-asserted-by":"publisher","unstructured":"Shahnawazuddin S, Ahmad W, Adiga N, Kumar A (2020c,) In-domain and out-of-domain data augmentation to improve Children\u2019s speaker verification system in limited data scenario. In ICASSP 2020-2020 IEEE international conference on acoustics, speech and signal processing (ICASSP). 7554\u20137558. IEEE. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053891","DOI":"10.1109\/ICASSP40776.2020.9053891"},{"key":"13435_CR54","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1016\/j.specom.2018.11.001","volume":"105","author":"S Shahnawazuddin","year":"2018","unstructured":"Shahnawazuddin S, Kathania HK, Dey A, Sinha R (2018) Improving children\u2019s mismatched ASR using structured low-rank feature projection. Speech Comm 105:103\u2013113. https:\/\/doi.org\/10.1016\/j.specom.2018.11.001","journal-title":"Speech Comm"},{"key":"13435_CR55","doi-asserted-by":"publisher","first-page":"101077","DOI":"10.1016\/j.csl.2020.101077","volume":"63","author":"PG Shivakumar","year":"2020","unstructured":"Shivakumar PG, Georgiou P (2020) Transfer learning from adult to children for speech recognition: evaluation, analysis and recommendations. Comput Speech Lang 63:101077","journal-title":"Comput Speech Lang"},{"key":"13435_CR56","doi-asserted-by":"publisher","unstructured":"Shriberg E, Ferrer L, Kajarekar S et al (2005) Modeling prosodic feature sequences for speaker recognition. Speech Commun 46(3\u20134):455\u2013472. https:\/\/doi.org\/10.1016\/j.specom.2005.02.018","DOI":"10.1016\/j.specom.2005.02.018"},{"key":"13435_CR57","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10462-019-09775-8","volume":"53","author":"A Singh","year":"2019","unstructured":"Singh A, Kadyan V, Kumar M, Bassan N (2019) ASRoIL: a comprehensive survey for automatic speech recognition of Indian languages. Artif Intell Rev 53:1\u201332. https:\/\/doi.org\/10.1007\/s10462-019-09775-8","journal-title":"Artif Intell Rev"},{"key":"13435_CR58","doi-asserted-by":"publisher","unstructured":"Singh A, Kaur N, Kukreja V et al (2022) Computational intelligence in processing of speech acoustics: a survey. Complex Intell Syst 8(2623):2661 https:\/\/doi.org\/10.1007\/s40747-022-00665-1","DOI":"10.1007\/s40747-022-00665-1"},{"key":"13435_CR59","first-page":"518","volume":"495","author":"D Talkin","year":"1995","unstructured":"Talkin D, Kleijn WB (1995) A robust algorithm for pitch tracking (RAPT). Speech coding and synthesis 495:518","journal-title":"Speech coding and synthesis"},{"key":"13435_CR60","doi-asserted-by":"crossref","unstructured":"Taniya, Bhardwaj V, Kadyan V (2020) Deep neural network trained Punjabi children speech recognition system using Kaldi toolkit. In 2020 IEEE 5th international conference on computing communication and automation (ICCCA) (pp. 374-378). IEEE","DOI":"10.1109\/ICCCA49541.2020.9250780"},{"key":"13435_CR61","doi-asserted-by":"publisher","first-page":"1112","DOI":"10.1016\/j.protcy.2013.12.124","volume":"9","author":"JP Teixeira","year":"2013","unstructured":"Teixeira JP, Oliveira C, Lopes C (2013) Vocal acoustic analysis\u2013jitter, shimmer and hnr parameters. Procedia Technology 9:1112\u20131122. https:\/\/doi.org\/10.1016\/j.protcy.2013.12.124","journal-title":"Procedia Technology"},{"issue":"1\u20132","key":"13435_CR62","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1016\/S0167-6393(02)00083-3","volume":"40","author":"L Ten Bosch","year":"2003","unstructured":"Ten Bosch L (2003) Emotions, speech and the ASR framework. Speech Comm 40(1\u20132):213\u2013225. https:\/\/doi.org\/10.1016\/S0167-6393(02)00083-3","journal-title":"Speech Comm"},{"key":"13435_CR63","doi-asserted-by":"crossref","unstructured":"Wang L, Ambikairajah E, Choi EH (2006) Automatic tonal and non-tonal language classification and language identification using prosodic information. In International Symposium on Chinese Spoken language Processing. (ISCSLP) 485\u2013496","DOI":"10.1109\/ICME.2007.4284659"},{"key":"13435_CR64","doi-asserted-by":"publisher","unstructured":"Wang L, Ambikairajah E, Choi EH (2007a,) A novel method for automatic tonal and non-tonal language classification. In 2007 IEEE International Conference on Multimedia and Expo. IEEE. 352\u2013355. https:\/\/doi.org\/10.1109\/ICME.2007.4284659","DOI":"10.1109\/ICME.2007.4284659"},{"key":"13435_CR65","doi-asserted-by":"crossref","unstructured":"Wang L, Ambikairajah E, Choi EH (2007b) Automatic language recognition with tonal and non-tonal language pre-classification. In 2007 15th European Signal Processing Conference 2375\u20132379. IEEE.","DOI":"10.1109\/ICME.2007.4284659"},{"key":"13435_CR66","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1016\/j.dsp.2018.12.013","volume":"86","author":"IC Yadav","year":"2019","unstructured":"Yadav IC, Shahnawazuddin S, Pradhan G (2019) Addressing noise and pitch sensitivity of speech recognition system through variational mode decomposition based spectral smoothing. Digital Signal Processing 86:55\u201364. https:\/\/doi.org\/10.1016\/j.dsp.2018.12.013","journal-title":"Digital Signal Processing"},{"key":"13435_CR67","doi-asserted-by":"publisher","unstructured":"Yeung G, Alwan A (2018) On the difficulties of automatic speech recognition for kindergarten-aged children. In INTERSPEECH 1661\u20131665. https:\/\/doi.org\/10.21437\/Interspeech.2018-2297","DOI":"10.21437\/Interspeech.2018-2297"},{"key":"13435_CR68","doi-asserted-by":"crossref","unstructured":"Zehra W, Javed AR, Jalil Z et al (2021) Cross corpus multi-lingual speech emotion recognition using ensemble learning. Complex and Intelligent Systems 7:1\u201310","DOI":"10.1007\/s40747-020-00250-4"},{"key":"13435_CR69","doi-asserted-by":"publisher","unstructured":"Zhang JS, Hirose K (2000) Anchoring hypothesis and its application to tone recognition of Chinese continuous speech. In 2000 IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings (Cat. No. 00CH37100). IEEE. 3:1419\u20131422. https:\/\/doi.org\/10.1109\/ICASSP.2000.861859","DOI":"10.1109\/ICASSP.2000.861859"},{"key":"13435_CR70","doi-asserted-by":"publisher","unstructured":"Zhao X, Wang D (2013) Analyzing noise robustness of MFCC and GFCC features in speaker identification. In 2013 IEEE international conference on acoustics, speech and signal processing 7204\u20137208. IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2013.6639061","DOI":"10.1109\/ICASSP.2013.6639061"},{"key":"13435_CR71","doi-asserted-by":"publisher","unstructured":"Zhu W, O'Shaughnessy D (2004) Incorporating frequency masking filtering in a standard MFCC feature extraction algorithm. In Proceedings 7th International Conference on Signal Processing, 2004. Proceedings. ICSP'04. 2004. IEEE. 1:617\u2013620. https:\/\/doi.org\/10.1109\/ICOSP.2004.1452739","DOI":"10.1109\/ICOSP.2004.1452739"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-13435-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-022-13435-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-13435-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,4]],"date-time":"2023-01-04T09:42:39Z","timestamp":1672825359000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-022-13435-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,7,20]]},"references-count":71,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,1]]}},"alternative-id":["13435"],"URL":"https:\/\/doi.org\/10.1007\/s11042-022-13435-5","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2022,7,20]]},"assertion":[{"value":"19 April 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 April 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 July 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 July 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"We have no conflict of interest to declare.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}