{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,24]],"date-time":"2025-05-24T05:06:51Z","timestamp":1748063211214,"version":"3.37.3"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2023,9,1]],"date-time":"2023-09-01T00:00:00Z","timestamp":1693526400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,9,1]],"date-time":"2023-09-01T00:00:00Z","timestamp":1693526400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,9]]},"DOI":"10.1007\/s10772-023-10039-8","type":"journal-article","created":{"date-parts":[[2023,9,8]],"date-time":"2023-09-08T14:03:14Z","timestamp":1694181794000},"page":"651-663","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":12,"title":["Towards modeling raw speech in gender identification of children using sincNet over ERB scale"],"prefix":"10.1007","volume":"26","author":[{"given":"Kodali","family":"Radha","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0524-8251","authenticated-orcid":false,"given":"Mohan","family":"Bansal","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,9,8]]},"reference":[{"key":"10039_CR48","doi-asserted-by":"crossref","unstructured":"Alashban, A. A., & Alotaibi, Y. A. (2021). Speaker gender classification in mono-language and cross-language using BLSTM network. In: 2021 44th International conference on telecommunications and signal processing(TSP), (pp. 66\u201371). 
IEEE.","DOI":"10.1109\/TSP52935.2021.9522623"},{"key":"10039_CR33","doi-asserted-by":"crossref","unstructured":"Alnuaim, A. A., Zakariah, M., Shashidhar, C., Hatamleh, W. A., Tarazi, H., Shukla, P. K., & Ratna, R. (2022). Speaker gender recognition based on deep neural networks and ResNet50. Wireless Communications and Mobile Computing 2022.","DOI":"10.1155\/2022\/4444388"},{"key":"10039_CR11","first-page":"12449","volume":"33","author":"A Baevski","year":"2020","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., & Auli, M. (2020). Wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in Neural Information Processing Systems, 33, 12449\u201312460.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10039_CR7","doi-asserted-by":"crossref","unstructured":"Bansal, M., & Sircar, P. (2019). Phoneme based model for gender identification and adult-child classification. In: 2019 13th\nInternational conference on signal processing and communication systems (ICSPCS), (pp. 1\u20137). IEEE.","DOI":"10.1109\/ICSPCS47537.2019.9008704"},{"key":"10039_CR2","unstructured":"Batliner, A., Hacker, C., Steidl, S., N\u00f6th, E., D'Arcy, S., Russell, M. J., & Wong, M. (2004).  You stupid tin box-children interacting with the aibo robot: A cross-linguistic emotional speech corpus."},{"key":"10039_CR27","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1007\/s10772-021-09808-0","volume":"24","author":"KB Bhangale","year":"2021","unstructured":"Bhangale, K. B., & Mohanaprasad, K. (2021). A review on speech processing using machine learning paradigm. International Journal of Speech Technology, 24, 367\u2013388.","journal-title":"International Journal of Speech Technology"},{"key":"10039_CR24","doi-asserted-by":"crossref","unstructured":"Bhattacharya, G., Alam, M. J., & Kenny, P. (2017, August). Deep Speaker Embeddings for Short-Duration Speaker Verification. In Interspeech, (pp. 
1517\u20131521).","DOI":"10.21437\/Interspeech.2017-1575"},{"issue":"12","key":"10039_CR21","doi-asserted-by":"publisher","first-page":"1750041","DOI":"10.1142\/S0218001417500410","volume":"31","author":"G Chaudhary","year":"2017","unstructured":"Chaudhary, G., Srivastava, S., & Bhardwaj, S. (2017). Feature extraction methods for speaker recognition: A review. International Journal of Pattern Recognition and Artificial Intelligence, 31(12), 1750041.","journal-title":"International Journal of Pattern Recognition and Artificial Intelligence"},{"key":"10039_CR15","doi-asserted-by":"crossref","unstructured":"Fainberg, J., Klejch, O., Loweimi, E., Bell, P., & Renals, S. (2019). Acoustic model adaptation from raw waveforms with SincNet. In 2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), (pp. 897\u2013904). IEEE","DOI":"10.1109\/ASRU46091.2019.9003974"},{"issue":"3","key":"10039_CR26","doi-asserted-by":"publisher","first-page":"543","DOI":"10.1007\/s10772-017-9424-2","volume":"20","author":"S Gautam","year":"2017","unstructured":"Gautam, S., & Singh, L. (2017). Development of spectro-temporal features of speech in children. International Journal of Speech Technology, 20(3), 543\u2013551.","journal-title":"International Journal of Speech Technology"},{"issue":"1","key":"10039_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s12046-018-1028-2","volume":"44","author":"S Gautam","year":"2019","unstructured":"Gautam, S., & Singh, L. (2019). The development of spectral features in the speech of Indian children. S\u0101dhan\u0101, 44(1), 1\u20137.","journal-title":"S\u0101dhan\u0101"},{"issue":"35","key":"10039_CR10","doi-asserted-by":"publisher","first-page":"1950438","DOI":"10.1142\/S0217984919504384","volume":"33","author":"M Gupta","year":"2019","unstructured":"Gupta, M., Bharti, S. S., & Agarwal, S. (2019). Gender-based speaker recognition from speech signals using gmm model. 
Modern Physics Letters B, 33(35), 1950438.","journal-title":"Modern Physics Letters B"},{"key":"10039_CR38","unstructured":"Ioffe, S., & Szegedy, C. (2015). Batch normalization: Accelerating deep network training by reducing internal covariate shift. In International conference on machine learning pmlr, (pp. 448\u2013456)."},{"key":"10039_CR29","doi-asserted-by":"crossref","unstructured":"Jung, J. W., Heo, H. S., Yang, I. H., Shim, H. J., & Yu, H. J. (2018). A complete end-to-end speaker verification system using deep neural networks: From raw signals to verification result. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),  (pp. 5349\u20135353). IEEE","DOI":"10.1109\/ICASSP.2018.8462575"},{"issue":"12","key":"10039_CR30","first-page":"23","volume":"8","author":"J-W Jung","year":"2018","unstructured":"Jung, J.-W., Heo, H.-S., Yang, I., Shim, H.-J., & Yu, H.-J. (2018). Avoiding speaker overfitting in end-to-end DNNs using raw waveform for text-independent speaker verification. Extraction, 8(12), 23\u201324.","journal-title":"Extraction"},{"key":"10039_CR46","doi-asserted-by":"crossref","unstructured":"Kabil, S. H., Muckenhirn, H., & Magimai-Doss, M. (2018). On learning to identify genders from raw speech signal using CNNs. In: Interspeech, (pp. 287\u2013291).","DOI":"10.21437\/Interspeech.2018-1240"},{"issue":"3","key":"10039_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s12046-021-01649-6","volume":"46","author":"V Karthikeyan","year":"2021","unstructured":"Karthikeyan, V., & Suja Priyadharsini, S. (2021). A strong hybrid adaboost classification algorithm for speaker recognition. S\u0101dhan\u0101, 46(3), 1\u201319.","journal-title":"S\u0101dhan\u0101"},{"key":"10039_CR41","unstructured":"Kingma, D.P., & Ba, J. (2014). Adam: A method for stochastic optimization. arXiv preprint http:\/\/arXiv.org\/1412.6980"},{"key":"10039_CR31","unstructured":"Krishna, D., Amrutha, D., Reddy, S. 
S., Acharya, A., Garapati, P. A., & Triveni, B. (2020). Language independent gender identification from raw waveform using multi-scale convolutional neural networks. In: 2020 IEEE international conference on acoustics, speech and signal processing (ICASSP), (pp. 6559\u20136563). IEEE."},{"key":"10039_CR32","doi-asserted-by":"crossref","unstructured":"Lebourdais, M., Tahon, M., Laurent, A., & Meignier, S. (2022). Overlapped speech and gender detection with WavLM pre-trained features. arXiv preprint http:\/\/arXiv.org\/2209.04167","DOI":"10.21437\/Interspeech.2022-10825"},{"key":"10039_CR43","doi-asserted-by":"crossref","unstructured":"Loweimi, E., Bell, P., & Renals, S. (2019). On learning interpretable CNNs with parametric modulated kernel-based filters. In: Interspeech, (pp. 3480\u20133484).","DOI":"10.21437\/Interspeech.2019-1257"},{"key":"10039_CR39","unstructured":"Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities improve neural network acoustic models. In Proceedings of the international conference on machine learning (ICML), icml 30, p. 3. Citeseer"},{"issue":"8","key":"10039_CR45","doi-asserted-by":"publisher","first-page":"2581","DOI":"10.1007\/s00521-017-2848-4","volume":"30","author":"AA Mallouh","year":"2018","unstructured":"Mallouh, A. A., Qawaqneh, Z., & Barkana, B. D. (2018). New transformed features generated by deep bottleneck extractor and a GMM-UBM classifier for speaker age and gender classification. Neural Computing and Applications, 30(8), 2581\u20132593.","journal-title":"Neural Computing and Applications"},{"issue":"3","key":"10039_CR40","doi-asserted-by":"publisher","first-page":"750","DOI":"10.1121\/1.389861","volume":"74","author":"BC Moore","year":"1983","unstructured":"Moore, B. C., & Glasberg, B. R. (1983). Suggested formulae for calculating auditory-filter bandwidths and excitation patterns. 
The Journal of the Acoustical Society of America, 74(3), 750\u2013753.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10039_CR28","doi-asserted-by":"crossref","unstructured":"Muckenhirn, H., Doss, M.M.-, & Marcell, S. (2018). Towards directly modeling raw speech signal for speaker verification using CNNs. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 4884\u20134888). IEEE.","DOI":"10.1109\/ICASSP.2018.8462165"},{"key":"10039_CR12","doi-asserted-by":"crossref","unstructured":"Pariente, M., Cornell, S., Deleforge, A., & Vincent, E. (2020). Filterbank design for end-to-end speech separation. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 6364\u20136368). IEEE.","DOI":"10.1109\/ICASSP40776.2020.9053038"},{"key":"10039_CR44","doi-asserted-by":"crossref","unstructured":"Peic Tukuljac, H., Ricaud, B., Aspert, N., & Colbois, L. (2022). Learnable filter-banks for CNN-based audio applications. In:\nProceedings of the northern Lights Deep Learning Workshop 2022.","DOI":"10.7557\/18.6279"},{"key":"10039_CR37","volume-title":"Theory and applications of digital speech processing","author":"L Rabiner","year":"2010","unstructured":"Rabiner, L., & Schafer, R. (2010). Theory and applications of digital speech processing. Prentice Hall Press."},{"key":"10039_CR36","doi-asserted-by":"publisher","DOI":"10.34740\/KAGGLE\/DSV\/4416485","author":"K Radha","year":"2022","unstructured":"Radha, K., & Bansal, M. (2022). Non-native children english speech (NNCES) corpus. Kaggle. https:\/\/doi.org\/10.34740\/KAGGLE\/DSV\/4416485","journal-title":"Kaggle"},{"key":"10039_CR22","doi-asserted-by":"publisher","DOI":"10.1007\/s00034-023-02399-y","author":"K Radha","year":"2023","unstructured":"Radha, K., & Bansal, M. (2023). Feature fusion and ablation analysis in gender identification of preschool children from spontaneous speech. 
Circuits Systems and Signal Processing. https:\/\/doi.org\/10.1007\/s00034-023-02399-y","journal-title":"Circuits Systems and Signal Processing"},{"issue":"10","key":"10039_CR4","doi-asserted-by":"publisher","first-page":"1490","DOI":"10.3390\/e24101490","volume":"24","author":"K Radha","year":"2022","unstructured":"Radha, K., & Bansal, M. (2022). Audio augmentation for non-native children\u2019s speech recognition through discriminative learning. Entropy, 24(10), 1490.","journal-title":"Entropy"},{"issue":"3","key":"10039_CR5","doi-asserted-by":"publisher","first-page":"1375","DOI":"10.1007\/s41870-023-01224-8","volume":"15","author":"K Radha","year":"2023","unstructured":"Radha, K., & Bansal, M. (2023). Closed-set automatic speaker identification using multi-scale recurrent networks in non-native children. International Journal of Information Technology, 15(3), 1375\u20131385.","journal-title":"International Journal of Information Technology"},{"key":"10039_CR3","doi-asserted-by":"crossref","unstructured":"Radha, K., Bansal, M., & Shabber, S. M. (2022). Accent classification of native and non-native children using harmonic pitch. In 2022 2nd International Conference on Artificial Intelligence and Signal Processing (AISP),  (pp. 1\u20136). IEEE.","DOI":"10.1109\/AISP53593.2022.9760588"},{"key":"10039_CR6","doi-asserted-by":"crossref","unstructured":"Radha, K., Bansal, M., & Sharma, R. (2023). Whitening Transformation of i-vectors in Closed-Set Speaker Verification of Children. In 2023 10th International Conference on Signal Processing and Integrated Networks (SPIN) (pp. 243\u2013248). IEEE.","DOI":"10.1109\/SPIN57001.2023.10116604"},{"key":"10039_CR16","unstructured":"Ravanelli, M., & Bengio, Y. (2018). Interpretable convolutional filters with sincnet. arXiv preprint http:\/\/arXiv:1811.09725."},{"key":"10039_CR14","doi-asserted-by":"crossref","unstructured":"Ravanelli, M., & Bengio, Y. (2018). Speaker recognition from raw waveform with sincnet. 
In 2018 IEEE spoken language technology workshop (SLT)(pp. 1021\u20131028), IEEE.","DOI":"10.1109\/SLT.2018.8639585"},{"issue":"5","key":"10039_CR25","doi-asserted-by":"publisher","first-page":"783","DOI":"10.1007\/s12046-011-0047-z","volume":"36","author":"KS Rao","year":"2011","unstructured":"Rao, K. S. (2011). Role of neural network models for developing speech systems. Sadhana, 36(5), 783\u2013836.","journal-title":"Sadhana"},{"key":"10039_CR42","unstructured":"Raschka, S. (2014). An overview of general performance metrics of binary classifier systems. arXiv preprint http:\/\/arXiv:1410.5330"},{"key":"10039_CR20","doi-asserted-by":"crossref","unstructured":"Richardson, F., Reynolds, D., & Dehak, N. (2015). A unified deep neural network for speaker and language recognition. arXiv preprint http:\/\/arXiv:1504.00923.","DOI":"10.21437\/Interspeech.2015-299"},{"issue":"2","key":"10039_CR17","doi-asserted-by":"publisher","first-page":"521","DOI":"10.1093\/ajcn\/72.2.521S","volume":"72","author":"AD Rogol","year":"2000","unstructured":"Rogol, A. D., Clark, P. A., & Roemmich, J. N. (2000). Growth and pubertal development in children and adolescents: Effects of diet and physical activity. The American Journal of Clinical Nutrition, 72(2), 521\u2013528.","journal-title":"The American Journal of Clinical Nutrition"},{"key":"10039_CR47","doi-asserted-by":"crossref","unstructured":"Sarma, M., Sarma, K. K., & Goel, N. K. (2020). Children\u2019s age and gender recognition from raw speech waveform using DNN. In Advances in Intelligent Computing and Communication: Proceedings of ICAC 2019, (pp. 1\u20139). Springer.","DOI":"10.1007\/978-981-15-2774-6_1"},{"key":"10039_CR34","doi-asserted-by":"publisher","first-page":"223","DOI":"10.1007\/s10772-020-09680-4","volume":"23","author":"M Sarma","year":"2020","unstructured":"Sarma, M., Sarma, K. K., & Goel, N. K. (2020). Multi-task learning DNN to improve gender identification from speech leveraging age information of the speaker. 
International Journal of Speech Technology, 23, 223\u2013240.","journal-title":"International Journal of Speech Technology"},{"key":"10039_CR9","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1016\/j.csl.2018.01.001","volume":"50","author":"S Safavi","year":"2018","unstructured":"Safavi, S., Russell, M., & Jan\u010dovi\u010d, P. (2018). Automatic speaker, age-group and gender identification from children\u2019s speech. Computer Speech Language, 50, 141\u2013156.","journal-title":"Computer Speech Language"},{"key":"10039_CR1","doi-asserted-by":"crossref","unstructured":"Schuller, B., Steidl, S., Batliner, A., Burkhardt, F., Devillers, L., M\u00fcller, C., & Narayanan, S. (2010). The INTERSPEECH 2010 paralinguistic challenge. In Proceedings of INTERSPEECH 2010, Makuhari, (pp. 4052\u20134056). IEEE.","DOI":"10.21437\/Interspeech.2010-739"},{"key":"10039_CR35","unstructured":"Schwoebel, J. Survey Lex. https:\/\/www.surveylex.com\/. Accessed: 2022-01-01"},{"key":"10039_CR19","doi-asserted-by":"crossref","unstructured":"Variani, E., Lei, X., McDermott, E., Moreno, I. L., & Gonzalez-Dominguez, J. (2014). Deep neural networks for small footprint text-dependent speaker verification. In 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP), (pp. 4052\u20134056). IEEE.","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"10039_CR13","doi-asserted-by":"crossref","unstructured":"Zhu, G., Jiang, F., & Duan, Z. (2020). Y-vector: Multiscale waveform encoder for speaker embedding. arXiv preprint http:\/\/arXiv.org\/2010.12951","DOI":"10.21437\/Interspeech.2021-1707"},{"issue":"9","key":"10039_CR23","doi-asserted-by":"publisher","first-page":"1633","DOI":"10.1109\/TASLP.2018.2831456","volume":"26","author":"C Zhang","year":"2018","unstructured":"Zhang, C., Koishida, K., & Hansen, J. H. (2018). Text-independent speaker verification based on triplet convolutional neural network embeddings. 
IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 26(9), 1633\u20131644.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10039-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-023-10039-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10039-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,10]],"date-time":"2023-11-10T14:10:21Z","timestamp":1699625421000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-023-10039-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9]]},"references-count":48,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,9]]}},"alternative-id":["10039"],"URL":"https:\/\/doi.org\/10.1007\/s10772-023-10039-8","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2023,9]]},"assertion":[{"value":"15 February 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 August 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 September 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The 
authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}