{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T15:54:48Z","timestamp":1764172488813,"version":"3.37.3"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2021,8,30]],"date-time":"2021-08-30T00:00:00Z","timestamp":1630281600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2021,8,30]],"date-time":"2021-08-30T00:00:00Z","timestamp":1630281600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,9]]},"DOI":"10.1007\/s10772-021-09888-y","type":"journal-article","created":{"date-parts":[[2021,8,30]],"date-time":"2021-08-30T15:03:53Z","timestamp":1630335833000},"page":"579-587","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["A deep learning approach for robust speaker identification using chroma energy normalized statistics and mel frequency cepstral coefficients"],"prefix":"10.1007","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0418-4057","authenticated-orcid":false,"given":"J. V. Thomas","family":"Abraham","sequence":"first","affiliation":[]},{"given":"A. Nayeemulla","family":"Khan","sequence":"additional","affiliation":[]},{"given":"A.","family":"Shahina","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,8,30]]},"reference":[{"key":"9888_CR1","first-page":"705","volume":"7","author":"JVT Abraham","year":"2019","unstructured":"Abraham, J. V. T., Shahina, A., & Khan, A. N. (2019). Enhancing noisy speech using WEMD. International Journal of Recent Technology and Engineering, 7, 705\u2013708.","journal-title":"International Journal of Recent Technology and Engineering"},{"key":"9888_CR2","doi-asserted-by":"publisher","first-page":"143","DOI":"10.3390\/app6050143","volume":"6","author":"F Alias","year":"2016","unstructured":"Alias, F., Carri\u00e9, J. C., & Sevillano, X. (2016). A review of physical and perceptual feature extraction techniques for speech, music and environmental sounds. Applied Sciences, 6, 143.","journal-title":"Applied Sciences"},{"key":"9888_CR3","doi-asserted-by":"crossref","unstructured":"Arsikere, H., An, H., & Alwan, A. (2014). Speaker recognition via fusion of subglottal features and MFCCs. In INTERSPEECH 2014.","DOI":"10.21437\/Interspeech.2014-284"},{"key":"9888_CR4","doi-asserted-by":"publisher","first-page":"96","DOI":"10.1109\/TMM.2004.840597","volume":"7","author":"M Bartsch","year":"2005","unstructured":"Bartsch, M., & Wakefield, G. (2005). Audio thumbnailing of popular music using chroma-based representations. IEEE Transactions on Multimedia, 7, 96\u2013104.","journal-title":"IEEE Transactions on Multimedia"},{"key":"9888_CR5","doi-asserted-by":"crossref","unstructured":"Bell, P., Gales, M. J. F., Hain, T., Kilgour, J., Lanchantin, P., Liu, X., McParland, A., Renals, S., Saz, O., Wester, M., & Woodland, P. C. (2015). The MGB challenge: Evaluating multi-genre broadcast media recognition. In IEEE workshop on automatic speech recognition and understanding (ASRU) (pp.\u00a0687\u2013693).","DOI":"10.1109\/ASRU.2015.7404863"},{"key":"9888_CR6","doi-asserted-by":"crossref","unstructured":"Campbell, J., Reynolds, D., & Dunn, R. (2003). Fusing high- and low-level features for speaker recognition. In In INTERSPEECH (pp.\u00a02665\u20132668).","DOI":"10.21437\/Eurospeech.2003-727"},{"key":"9888_CR7","doi-asserted-by":"publisher","first-page":"210","DOI":"10.1016\/j.csl.2005.06.003","volume":"20","author":"W Campbell","year":"2006","unstructured":"Campbell, W., Campbell, J., Reynolds, D., Singer, E., & Torres-Carrasquillo, P. (2006). Support vector machines for speaker and language recognition. Computer Speech & Language, 20, 210\u2013229.","journal-title":"Computer Speech & Language"},{"key":"9888_CR8","doi-asserted-by":"crossref","unstructured":"Chang, J., & Wang, D. (2017). Robust speaker recognition based on DNN\/i-vectors and speech separation. In IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp.\u00a05415\u20135419).","DOI":"10.1109\/ICASSP.2017.7953191"},{"key":"9888_CR9","doi-asserted-by":"publisher","first-page":"1616","DOI":"10.1109\/TIFS.2019.2941773","volume":"15","author":"A Chowdhury","year":"2020","unstructured":"Chowdhury, A., & Ross, A. (2020). Fusing mfcc and lpc features using 1d triplet cnn for speaker recognition in severely degraded audio signals. IEEE Transactions on Information Forensics and Security, 15, 1616\u20131629.","journal-title":"IEEE Transactions on Information Forensics and Security"},{"key":"9888_CR17","unstructured":"Convolutional Neural Networks. (2018). https:\/\/www.datasciencecentral.com\/profiles\/blogs\/understanding-neural-networks-from-neuron-to-rnn-cnn-and-deep."},{"key":"9888_CR10","doi-asserted-by":"crossref","unstructured":"Dehak, N., Dehak, R., Kenny, P., Brummer, N., Ouellet, P., & Dumouchel, P. (2009). Support vector machines versus fast scoring in the low-dimensional total variability space for speaker verification. Proceedings of the annual conference of the international speech communication association, INTERSPEECH (vol. 1, pp. 1559\u20131562).","DOI":"10.21437\/Interspeech.2009-385"},{"issue":"1","key":"9888_CR11","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1007\/s10772-013-9205-5","volume":"17","author":"MAA El-Fattah","year":"2014","unstructured":"El-Fattah, M. A. A., Dessouky, M. I., Abbas, A. M., Diab, S. M., El-Rabaie, E.-S.M., Al-Nuaimy, W., et al. (2014). Speech enhancement with an adaptive wiener filter. International Journal of Speech Technology, 17(1), 53\u201364.","journal-title":"International Journal of Speech Technology"},{"key":"9888_CR12","doi-asserted-by":"crossref","unstructured":"Friedland, G., Vinyals, O., Huang, C., & M\u00fcller, C. (2009). Fusing short term and long term features for improved speaker diarization. In IEEE international conference on acoustics, speech and signal processing (pp.\u00a04077\u20134080).","DOI":"10.1109\/ICASSP.2009.4960524"},{"key":"9888_CR13","first-page":"27403","volume":"93","author":"J Garofolo","year":"1993","unstructured":"Garofolo, J., Lamel, L., Fisher, W., Fiscus, J., & Pallett, D. (1993). DARPA TIMIT acoustic-phonetic continous speech corpus CD-ROM. NIST speech disc 1\u20131.1. NASA STI\/Recon Technical Report, 93, 27403.","journal-title":"NASA STI\/Recon Technical Report"},{"issue":"4","key":"9888_CR14","doi-asserted-by":"publisher","first-page":"EL420","DOI":"10.1121\/1.4979841","volume":"141","author":"J Guo","year":"2017","unstructured":"Guo, J., Yang, R., Arsikere, H., & Alwan, A. (2017). Robust speaker identification via fusion of subglottal resonances and cepstral features. The Journal of the Acoustical Society of America, 141(4), EL420\u2013EL426.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"9888_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In IEEE conference on computer vision and pattern recognition (CVPR) (pp.\u00a0770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"key":"9888_CR16","doi-asserted-by":"publisher","first-page":"1091","DOI":"10.1109\/ICASSP.1997.596131","volume":"2","author":"J He","year":"1997","unstructured":"He, J., Liu, L., & Palm, G. (1997). A new codebook training algorithm for VQ-based speaker recognition. IEEE International Conference on Acoustics, Speech, and Signal Processing, 2, 1091\u20131094.","journal-title":"IEEE International Conference on Acoustics, Speech, and Signal Processing"},{"key":"9888_CR18","doi-asserted-by":"crossref","unstructured":"Janin, A., Baron, D., Edwards, J., Ellis, D., Gelbart, D., Morgan, N., Peskin, B., Pfau, T., Shriberg, E., Stolcke, A., & Wooters, C. (2003). The ICSI meeting corpus. In IEEE international conference on acoustics, speech, and signal processing (vol.\u00a01).","DOI":"10.1109\/ICASSP.2003.1198793"},{"key":"9888_CR19","doi-asserted-by":"crossref","unstructured":"Kanagasundaram, A., Vogt, R., Dean, D., Sridharan, S., & Mason, M. (2011). i-vector based speaker recognition on short utterances. In Proceedings of the annual conference of the international speech communication association, INTERSPEECH.","DOI":"10.21437\/Interspeech.2011-58"},{"key":"9888_CR20","doi-asserted-by":"publisher","first-page":"12","DOI":"10.1016\/j.specom.2009.08.009","volume":"52","author":"T Kinnunen","year":"2010","unstructured":"Kinnunen, T., & Li, H. (2010). An overview of text-independent speaker recognition: From features to supervectors. Speech Communication, 52, 12\u201340.","journal-title":"Speech Communication"},{"key":"9888_CR21","doi-asserted-by":"crossref","unstructured":"Lawson, A., Vabishchevich, P., Huggins, M., Ardis, P., Battles, B., & Stauffer, A. (2011) Survey and evaluation of acoustic features for speaker recognition. In IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp.\u00a05444\u20135447).","DOI":"10.1109\/ICASSP.2011.5947590"},{"key":"9888_CR22","doi-asserted-by":"crossref","unstructured":"Lei, Y., Burget, L., & Scheffer, N. (2013). A noise robust i-vector extractor using vector taylor series for speaker recognition. In IEEE international conference on acoustics, speech and signal processing (pp.\u00a06788\u20136791).","DOI":"10.1109\/ICASSP.2013.6638976"},{"key":"9888_CR23","unstructured":"McCool, C., Marcel, S., & \u201dMOBIO Database for the ICPR . (2010). Face and Speech Competition. Idiap-Com Idiap-Com-02-2009. Idiap, 11, 2009."},{"key":"9888_CR24","unstructured":"Mccowan, I., Carletta, J., Kraaij, W., Ashby, S., Bourban, S., Flynn, M., Guillemot, M., Hain, T., Kadlec, J., Karaiskos, V., Kronenthal, M., Lathoud, G., Lincoln, M., Masson, A. Lisowska., Post, W., Reidsma, D., & Wellner, P. (2005). The AMI meeting corpus. In International conference on methods and techniques in behavioral research."},{"key":"9888_CR25","doi-asserted-by":"crossref","unstructured":"Millar, J. B., Vonwiller, J. P., Harrington, J. M., & Dermody, P. J. (1994). \u201cThe Australian National Database of Spoken Language. In Proceedings of IEEE international conference on acoustics, speech and signal processing (vol.\u00a0i, pp.\u00a0I\/97\u2013I100).","DOI":"10.1109\/ICASSP.1994.389346"},{"key":"9888_CR26","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1016\/j.specom.2016.07.006","volume":"85","author":"GS Morrison","year":"2016","unstructured":"Morrison, G. S., & Enzinger, E. (2016). Multi-laboratory evaluation of forensic voice comparison systems under conditions reflecting those of a real forensic case (forensic\\_eval\\_01) introduction. Speech Communication, 85, 119\u2013126.","journal-title":"Speech Communication"},{"key":"9888_CR27","unstructured":"M\u00fcller, M., Kurth, F., & Clausen, M. (2005). Audio matching via chroma-based statistical features. In 6th International conference on music information retrieval, ISMIR (pp.\u00a0288\u2013295)."},{"key":"9888_CR28","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J. S., & Zisserman, A. (2017). VoxCeleb: a large-scale speaker identification dataset. In INTERSPEECH.","DOI":"10.21437\/Interspeech.2017-950"},{"key":"9888_CR29","doi-asserted-by":"publisher","first-page":"265","DOI":"10.1016\/S0167-6393(99)00082-5","volume":"31","author":"D Petrovska-Delacr\u00e9taz","year":"2000","unstructured":"Petrovska-Delacr\u00e9taz, D., Hennebert, J., Melin, H., & Genoud, D. (June 2000). POLYCOST: A telephone-speech database for speaker recognition. Speech Communication, 31, 265\u2013270.","journal-title":"Speech Communication"},{"key":"9888_CR30","doi-asserted-by":"crossref","unstructured":"Prince, S. J. D., & Elder, J. H. (2007). Probabilistic linear discriminant analysis for inferences about identity. In IEEE 11th international conference on computer vision (pp.\u00a01\u20138).","DOI":"10.1109\/ICCV.2007.4409052"},{"key":"9888_CR31","doi-asserted-by":"publisher","first-page":"72","DOI":"10.1109\/89.365379","volume":"3","author":"D Reynolds","year":"1995","unstructured":"Reynolds, D., & Rose, R. (1995). Robust text-independent speaker identification using Gaussian Mixture speaker models. IEEE Transactions on Speech and Audio Processing, 3, 72\u201383.","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"key":"9888_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/LSP.2015.2420092","volume":"22","author":"F Richardson","year":"2015","unstructured":"Richardson, F., Reynolds, D., & Dehak, N. (2015). Deep neural network approaches to speaker and language recognition. IEEE Signal Processing Letters, 22, 1\u20131.","journal-title":"IEEE Signal Processing Letters"},{"key":"9888_CR33","doi-asserted-by":"crossref","unstructured":"Sell, G., & Clark, P. (2014). Music tonality features for speech\/music discrimination. In IEEE international conference on acoustics (pp. 2489\u20132493). ICASSP: Speech and Signal Processing\u2014Proceedings.","DOI":"10.1109\/ICASSP.2014.6854048"},{"key":"9888_CR34","unstructured":"Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556."},{"key":"9888_CR35","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., Povey, D., & Khudanpur, S. (2018). X-vectors: Robust DNN embeddings for speaker recognition. In IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp.\u00a05329\u20135333).","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"9888_CR36","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Ioffe, S., & Vanhoucke, V. (2016). Inception-v4, inception-resnet and the impact of residual connections on learning. CoRR, vol.\u00a0abs\/1602.07261.","DOI":"10.1609\/aaai.v31i1.11231"},{"issue":"1","key":"9888_CR37","doi-asserted-by":"publisher","first-page":"6","DOI":"10.1109\/LSP.2015.2495102","volume":"23","author":"R Tavares","year":"2016","unstructured":"Tavares, R., & Coelho, R. (2016). Speech enhancement with nonstationary acoustic noise detection in time domain. IEEE Signal Processing Letters, 23(1), 6\u201310.","journal-title":"IEEE Signal Processing Letters"},{"key":"9888_CR38","doi-asserted-by":"crossref","unstructured":"Torfi, A., Dawson, J., & Nasrabadi, N. M.(2018). Text-independent speaker verification using 3D Convolutional Neural Networks. In IEEE international conference on multimedia and expo (ICME) (pp.\u00a01\u20136).","DOI":"10.1109\/ICME.2018.8486441"},{"key":"9888_CR39","doi-asserted-by":"crossref","unstructured":"Variani, E., Lei, X., McDermott, E., Moreno, I. L., Gonzalez-Dominguez, J. (2014). Deep neural networks for small footprint text-dependent speaker verification. In IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp.\u00a04052\u20134056).","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"9888_CR40","doi-asserted-by":"crossref","unstructured":"Vloed, D. van der., Bouten, J., & Leeuwen, D. Van. (2014). NFI-FRITS: A forensic speaker recognition database and some first experiments. In Proceedings of Odyssey speaker and language recognition workshop (pp.\u00a06\u201313).","DOI":"10.21437\/Odyssey.2014-2"},{"key":"9888_CR41","doi-asserted-by":"crossref","unstructured":"Woo, R. H., Park, A., & Hazen, T. J. (2006). The MIT mobile device speaker verification corpus: Data collection and preliminary experiments. In IEEE Odyssey\u2014the speaker and language recognition workshop (pp.\u00a01\u20136).","DOI":"10.1109\/ODYSSEY.2006.248083"},{"key":"9888_CR42","doi-asserted-by":"crossref","unstructured":"Yu, H., Tan, Z.-H., Ma, Z., & Guo, J. (2017). Adversarial network bottleneck features for noise robust speaker verification. In INTERSPEECH (pp.\u00a01492\u20131496).","DOI":"10.21437\/Interspeech.2017-883"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-021-09888-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-021-09888-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-021-09888-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,10]],"date-time":"2023-11-10T14:09:26Z","timestamp":1699625366000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-021-09888-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,8,30]]},"references-count":42,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,9]]}},"alternative-id":["9888"],"URL":"https:\/\/doi.org\/10.1007\/s10772-021-09888-y","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2021,8,30]]},"assertion":[{"value":"26 July 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 August 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 August 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}