{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T18:51:48Z","timestamp":1767034308934,"version":"3.40.3"},"reference-count":133,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2012,7,26]],"date-time":"2012-07-26T00:00:00Z","timestamp":1343260800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2012,11]]},"DOI":"10.1007\/s00530-012-0266-0","type":"journal-article","created":{"date-parts":[[2012,7,25]],"date-time":"2012-07-25T01:00:54Z","timestamp":1343178054000},"page":"499-518","source":"Crossref","is-referenced-by-count":14,"title":["Speech information retrieval: a review"],"prefix":"10.1007","volume":"18","author":[{"given":"Ryan P.","family":"Hafen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael J.","family":"Henry","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2012,7,26]]},"reference":[{"key":"266_CR1","doi-asserted-by":"crossref","unstructured":"Adami, A., Mihaescu, R., Reynolds, D., Godfrey, J.: Modeling prosodic dynamics for speaker recognition. In: Proceedings of the ICASSP, vol. 4, pp. 788\u2013791 (2003)","DOI":"10.1109\/ICASSP.2003.1202761"},{"key":"266_CR2","unstructured":"Allamanche, E., Herre, J., Hellmuth, O., Fr\u00f6ba, B., Kastner, T., Cremer, M.: Content-based identification of audio material using MPEG-7 low level description. In: Proceedings of the International Symposium of Music Information Retrieval (2001)"},{"key":"266_CR3","unstructured":"Allegro, S., Buchler, M., Launer, S.: Automatic sound classification inspired by auditory scene analysis. In: Consistent and Reliable Acoustic Cues for Sound Analysis (CRAC), One-Day Workshop, Aalborg, Denmark (2001)"},{"issue":"4","key":"266_CR4","first-page":"515","volume":"41","author":"W. Al-Sawalmeh","year":"2010","unstructured":"Al-Sawalmeh, W., Daqrouq, K., Daoud, O., Al-Qawasmi, A.: Speaker identification system-based mel frequency and wavelet transform using neural network classifier. Eur. J. Sci. Res. 41(4), 515\u2013525 (2010)","journal-title":"Eur. J. Sci. Res."},{"key":"266_CR5","doi-asserted-by":"crossref","unstructured":"Anguera, X., Wooters, C., Pardo, J.: Robust speaker diarization for meetings: ICSI RT06s evaluation system. In: Ninth International Conference on Spoken Language Processing (2006)","DOI":"10.1007\/11965152_31"},{"key":"266_CR6","doi-asserted-by":"crossref","unstructured":"Azmi, M., Tolba, H., Mahdy, S., Fashal, M.: Syllable-based automatic Arabic speech recognition. In: Proceedings of the 7th WSEAS International Conference on Signal Processing, Robotics and Automation, pp. 246\u2013250. World Scientific and Engineering Academy and Society (WSEAS), Greece (2008)","DOI":"10.1109\/ICOSP.2008.4697204"},{"issue":"3","key":"266_CR7","doi-asserted-by":"crossref","first-page":"75","DOI":"10.1109\/MSP.2009.932166","volume":"26","author":"J. Baker","year":"2009","unstructured":"Baker, J., Deng, L., Glass, J., Khudanpur, S., Lee, C., Morgan, N., O\u2019Shaugnessy, D.: Research developments and directions in speech recognition and understanding. Part 1. IEEE Signal Process. Mag. 26(3), 75\u201380 (2009)","journal-title":"IEEE Signal Process. Mag."},{"key":"266_CR8","unstructured":"Barbu, T.: A supervised text-independent speaker recognition approach. World Acad. Sci. Eng. Technol. 33 (2007)"},{"key":"266_CR9","unstructured":"Barras, C., Zhu, X., Meignier, S., Gauvain, J.: Improving speaker diarization. In: RT-04F Workshop (2004)"},{"key":"266_CR10","doi-asserted-by":"crossref","unstructured":"Benedetto, D., Caglioti, E., Loreto, V.: Language trees and zipping. Phys. Rev. Lett. 88(4) (2002)","DOI":"10.1103\/PhysRevLett.88.048702"},{"issue":"10\u201311","key":"266_CR11","doi-asserted-by":"crossref","first-page":"763","DOI":"10.1016\/j.specom.2007.02.006","volume":"49","author":"M. Benzeghiba","year":"2007","unstructured":"Benzeghiba, M., De Mori, R., Deroo, O., Dupont, S., Erbes, T., Jouvet, D., Fissore, L., Laface, P., Mertins, A., Ris, C., Rose, R., Tyagi, V., Wellekens, C.: Automatic speech recognition and speech variability: a review. Speech Commun. 49(10\u201311), 763\u2013786 (2007)","journal-title":"Speech Commun."},{"key":"266_CR12","doi-asserted-by":"crossref","first-page":"430","DOI":"10.1155\/S1110865704310024","volume":"4","author":"F. Bimbot","year":"2004","unstructured":"Bimbot, F., Bonastre, J.F., Fredouille, C., Gravier, G., Magrin-Chagnolleau, I., Meignier, S., Merlin, T., Ortega-Garc\u0131a, J.: A tutorial on text-independent speaker verification. EURASIP J. Appl. Signal Process. 4, 430\u2013451 (2004)","journal-title":"EURASIP J. Appl. Signal Process."},{"key":"266_CR13","doi-asserted-by":"crossref","unstructured":"Bonastre, J., Wils, F., Meignier, S.: ALIZE, a free toolkit for speaker recognition. In: Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2005), Philadelphia, USA, pp. 737\u2013740 (2005)","DOI":"10.1109\/ICASSP.2005.1415219"},{"key":"266_CR14","unstructured":"Bonastre, J., Scheffer, N., Matrouf, D., Fredouille, C., Larcher, A., Preti, A., Pouchoulin, G., Evans, N., Fauve, B., Mason, J.: ALIZE\/SpkDet: a state-of-the-art open source software for speaker recognition. In: Odyssey-The Speaker and Language Recognition Workshop (2008)"},{"key":"266_CR15","doi-asserted-by":"crossref","unstructured":"Brill, E.: Discovering the lexical features of a language. In: Proceedings of the 29th Annual Meeting on Association for Computational Linguistics, pp. 339\u2013340. Association for Computational Linguistics (1991)","DOI":"10.3115\/981344.981392"},{"issue":"2\u20133","key":"266_CR16","doi-asserted-by":"crossref","first-page":"230","DOI":"10.1016\/j.csl.2005.08.001","volume":"20","author":"N. Br\u00fcmmer","year":"2006","unstructured":"Br\u00fcmmer, N., du Preez, J.: Application-independent evaluation of speaker detection. Comput. Speech Lang. 20(2\u20133), 230\u2013275 (2006)","journal-title":"Comput. Speech Lang."},{"issue":"3","key":"266_CR17","doi-asserted-by":"crossref","first-page":"165","DOI":"10.1109\/TSA.2003.811538","volume":"11","author":"C. Burges","year":"2003","unstructured":"Burges, C., Platt, J., Jana, S.: Distortion discriminant analysis for audio fingerprinting. IEEE Trans. Speech Audio Process. 11(3), 165\u2013174 (2003)","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"266_CR18","doi-asserted-by":"crossref","first-page":"029901","DOI":"10.1117\/1.3152242","volume":"18","author":"F. Camastra","year":"2009","unstructured":"Camastra, F., Vinciarelli, A., Yu, J.: Machine learning for audio, image and video analysis. J. Electron. Imaging 18, 029901 (2009)","journal-title":"J. Electron. Imaging"},{"key":"266_CR19","doi-asserted-by":"crossref","unstructured":"Campbell, J., Reynolds, D., Dunn, R.: Fusing high-and low-level features for speaker recognition. In: Eighth European Conference on Speech Communication and Technology (2003)","DOI":"10.21437\/Eurospeech.2003-727"},{"key":"266_CR20","doi-asserted-by":"crossref","unstructured":"Campbell, W., Sturim, D., Reynolds, D.: Support vector machines using GMM supervectors for speaker verification. IEEE Signal Process. Lett. 13(5) (2006)","DOI":"10.1109\/LSP.2006.870086"},{"issue":"2009","key":"266_CR21","doi-asserted-by":"crossref","first-page":"95","DOI":"10.1109\/MSP.2008.931100","volume":"26","author":"J.P. Campbell","year":"2009","unstructured":"Campbell, J.P., Shen, W., Campbell, W.M., Schwartz, R., Bonastre, J.F., Matrouf, D.: Forensic speaker recognition. Signal Process. Mag. IEEE 26(2009), 95\u2013103 (2009)","journal-title":"Signal Process. Mag. IEEE"},{"issue":"3","key":"266_CR22","doi-asserted-by":"crossref","first-page":"271","DOI":"10.1007\/s11265-005-4151-3","volume":"41","author":"P. Cano","year":"2005","unstructured":"Cano, P., Batlle, E., Kalker, T., Haitsma, J.: A review of audio fingerprinting. J. VLSI Signal Process. 41(3), 271\u2013284 (2005)","journal-title":"J. VLSI Signal Process."},{"key":"266_CR23","unstructured":"Canseco-Rodriguez, L., Lamel, L., Gauvain, J.: Speaker diarization from speech transcripts. In: Proceedings of the ICSLP, vol. 4 (2004)"},{"issue":"2","key":"266_CR24","doi-asserted-by":"crossref","first-page":"153","DOI":"10.1017\/S1355771801002126","volume":"6","author":"M. Casey","year":"2002","unstructured":"Casey, M.: General sound classification and similarity in MPEG-7. Organ. Sound 6(2), 153\u2013164 (2002)","journal-title":"Organ. Sound"},{"key":"266_CR25","doi-asserted-by":"crossref","unstructured":"Cohen, L.: Time frequency distributions\u2014a review. In: Proceedings of the IEEE, vol. 77 (1989)","DOI":"10.1109\/5.30749"},{"key":"266_CR26","unstructured":"de Jong, F., Gauvain, J.L., Hiemstra, D., Netter, K.: Language-based multimedia information retrieval. In: In 6th RIAO Conference (2000)"},{"key":"266_CR27","unstructured":"Dunning, T.: Statistical identification of language. Tech. Rep. MCCS 94-273, New Mexico State University (1994)"},{"key":"266_CR28","unstructured":"Dusan, S., Deng, L.: Estimation of articulatory parameters from speech acoustics by Kalman filtering. In: Proceedings of CITO Researcher Retreat-Hamilton (1998)"},{"key":"266_CR29","unstructured":"ELDA: Evaluations and Language Resources Distribution Agency (2010). http:\/\/www.elda.org\/"},{"issue":"7","key":"266_CR30","doi-asserted-by":"crossref","first-page":"1960","DOI":"10.1109\/TASL.2007.902877","volume":"15","author":"B.G.B. Fauve","year":"2007","unstructured":"Fauve, B.G.B., Matrouf, D., Scheffer, N., Bonastre, J.F.F., Mason, J.S.D.: State-of-the-art performance in text-independent speaker verification through open-source software. IEEE Trans. Audio Speech Lang. Process. 15(7), 1960\u20131968 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"266_CR31","doi-asserted-by":"crossref","unstructured":"Ferrer, L., Scheffer, N., Shriberg, E.: A comparison of approaches for modeling prosodic features in speaker recognition. In: IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP), pp. 4414\u20134417. IEEE, New York (2010)","DOI":"10.1109\/ICASSP.2010.5495632"},{"key":"266_CR32","doi-asserted-by":"crossref","unstructured":"Friedland, A., Vinyals, B., Huang, C., Muller, D.: Fusing short term and long term features for improved speaker diarization. In: Acoustics, Speech and Signal Processing, ICASSP 2009, pp. 4077\u20134080. IEEE (2009)","DOI":"10.1109\/ICASSP.2009.4960524"},{"key":"266_CR33","unstructured":"Fulop, S., Disner, S.: The reassigned spectrogram as a tool for voice identification. In: Proceedings of ICPhS 2007, pp. 1853\u20131856 (2007)"},{"key":"266_CR34","doi-asserted-by":"crossref","unstructured":"Fulop, S., Disner, S.: Advanced time-frequency displays applied to forensic speaker identification. Proc. Meet. Acoust. 6, 060008 (2009)","DOI":"10.1121\/1.3277007"},{"issue":"5","key":"266_CR35","doi-asserted-by":"crossref","first-page":"833","DOI":"10.1007\/BF02832422","volume":"10","author":"C. Gang","year":"2005","unstructured":"Gang, C., Hui, T., Xin-meng, C.: Audio segmentation via the similarity measure of audio feature vectors. Wuhan Univ. J. Nat. Sci. 10(5), 833\u2013837 (2005)","journal-title":"Wuhan Univ. J. Nat. Sci."},{"key":"266_CR36","unstructured":"Gannert, T.: A Speaker Verification System Under the Scope: Alize. Master\u2019s thesis, TMH (2007)"},{"key":"266_CR37","unstructured":"Gravier, G., Betser, M., Ben, M.: Audio Segmentation Toolkit, release 1.2. IRISA (2010)"},{"issue":"2","key":"266_CR38","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1076\/jnmr.32.2.211.16746","volume":"32","author":"J. Haitsma","year":"2003","unstructured":"Haitsma, J., Kalker, T.: A highly robust audio fingerprinting system with an efficient search strategy. J. New Music Res. 32(2), 211\u2013221 (2003)","journal-title":"J. New Music Res."},{"key":"266_CR39","unstructured":"Haitsma, J., Kalker, T., Oostveen, J.: Robust audio hashing for content identification. In: Proceedings of the Content-Based Multimedia Indexing (2001)"},{"key":"266_CR40","doi-asserted-by":"crossref","unstructured":"Hansen, J., Bou-Ghazale, S., Sarikaya, R., Pellom, B.: Getting started with the SUSAS: speech under simulated and actual stress database. In: Robust Speech Processing Laboratory (1998)","DOI":"10.21437\/Eurospeech.1997-494"},{"key":"266_CR41","doi-asserted-by":"crossref","unstructured":"Hansen, J.H., Gavidia-Ceballos, L., Kaiser, J.F.: A nonlinear operator-based speech feature analysis method with application to vocal fold pathology assessment. In: IEEE Transactions on Biomedical Engineering (1998)","DOI":"10.1109\/10.661155"},{"issue":"1","key":"266_CR42","doi-asserted-by":"crossref","first-page":"51","DOI":"10.1109\/PROC.1978.10837","volume":"66","author":"F. Harris","year":"1978","unstructured":"Harris, F.: On the use of windows for harmonic analysis with the discrete Fourier transform. Proc. IEEE 66(1), 51\u201383 (1978)","journal-title":"Proc. IEEE"},{"issue":"4","key":"266_CR43","doi-asserted-by":"crossref","first-page":"1738","DOI":"10.1121\/1.399423","volume":"87","author":"H. Hermansky","year":"1990","unstructured":"Hermansky, H.: Perceptual linear predictive (PLP) analysis of speech. J. Acoust. Soc. Am. 87(4), 1738\u20131752 (1990)","journal-title":"J. Acoust. Soc. Am."},{"issue":"4","key":"266_CR44","doi-asserted-by":"crossref","first-page":"578","DOI":"10.1109\/89.326616","volume":"2","author":"H. Hermansky","year":"1994","unstructured":"Hermansky, H., Morgan, N.: RASTA processing of speech. IEEE Trans. Speech Audio Process. 2(4), 578\u2013589 (1994)","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"266_CR45","unstructured":"Heymann, M.: sound: A sound interface for R. R package version 1.3 (2010). http:\/\/CRAN.R-project.org\/package=soun"},{"key":"266_CR46","volume-title":"Segmentation, Diarization and Speech Transcription: Surprise Data Unraveled","author":"M. Huijbregts","year":"2008","unstructured":"Huijbregts, M.: Segmentation, Diarization and Speech Transcription: Surprise Data Unraveled. PrintPartners Ipskamp, Enschede (2008)"},{"key":"266_CR47","unstructured":"ISIP: Automatic speech recognition (2010). http:\/\/www.isip.piconepress.com\/projects\/speech\/index.html"},{"key":"266_CR48","volume-title":"Statistical Methods for Speech Recognition","author":"F. Jelinek","year":"1997","unstructured":"Jelinek, F.: Statistical Methods for Speech Recognition. MIT Press, Cambridge (1997)"},{"key":"266_CR49","unstructured":"Jiang, D.N., Cai, L.H.: Speech emotion classification with the combination of statistic features and temporal features. In: IEEE International Conference on Multimedia and Expo (2004)"},{"key":"266_CR50","unstructured":"Jin, Q.: Robust Speaker Recognition. Ph.D. thesis, Carnegie Mellon University (2007)"},{"key":"266_CR51","doi-asserted-by":"crossref","unstructured":"Kajarekar, S., Ferrer, L., Stolcke, A., Shriberg, E.: Voice-based speaker recognition combining acoustic and stylistic features. In: Advances in Biometrics, pp. 183\u2013201 (2008)","DOI":"10.1007\/978-1-84628-921-7_10"},{"issue":"3","key":"266_CR52","doi-asserted-by":"crossref","first-page":"345","DOI":"10.1109\/TSA.2004.840940","volume":"13","author":"P. Kenny","year":"2005","unstructured":"Kenny, P., Boulianne, G., Dumouchel, P.: Eigenvoice modeling with sparse training data. IEEE Trans. Speech Audio Process. 13(3), 345\u2013354 (2005)","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"266_CR53","doi-asserted-by":"crossref","unstructured":"Kimura, A., Kashino, K., Kurozumi, T., Murase, H.: Very quick audio searching: introducing global pruning to the time-series active search. In: IEEE International Conference on Acoustics Speech and Signal Processing, vol. 3 (2001)","DOI":"10.1109\/ICASSP.2001.941198"},{"issue":"1","key":"266_CR54","doi-asserted-by":"crossref","first-page":"12","DOI":"10.1016\/j.specom.2009.08.009","volume":"52","author":"T. Kinnunen","year":"2010","unstructured":"Kinnunen, T., Li, H.: An overview of text-independent speaker recognition: from features to supervectors. Speech Commun. 52(1), 12\u201340 (2010)","journal-title":"Speech Commun."},{"key":"266_CR55","unstructured":"Kinnunen, T.: Spectral features for automatic text-independent speaker recognition. Ph. Lic. thesis, University of Joensuu, Department of Computer Science (2004)"},{"key":"266_CR56","unstructured":"Larcher, A., L\u00e9vy, C., Matrouf, D., Bonastre, J.: LIA NIST-SRE\u201910 systems. Unpublished (2010)"},{"key":"266_CR57","unstructured":"LDC: Language Data Consortium (2010). http:\/\/www.ldc.upenn.edu\/"},{"key":"266_CR58","unstructured":"Lee, A., Kawahara, T., Takeda, K., Mimura, M., Yamada, A., Ito, A., Itou, K., Shikano, K.: Continuous speech recognition consortium\u2014an open repository for CSR tools and models. In: Proceedings of the IEEE International Conference on Language Resources and Evaluation (2002)"},{"key":"266_CR59","unstructured":"Lee, C.H.: Back to speech science-towards a collaborative ASR community of the 21st century. In: Dynamics of Speech Production and Perception, p. 221 (2006)"},{"issue":"5","key":"266_CR60","doi-asserted-by":"crossref","first-page":"619","DOI":"10.1109\/89.861383","volume":"8","author":"S. Li","year":"2002","unstructured":"Li, S.: Content-based audio classification and retrieval using the nearest feature line method. IEEE Trans. Speech Audio Process. 8(5), 619\u2013625 (2002)","journal-title":"IEEE Trans. Speech Audio Process."},{"issue":"5","key":"266_CR61","doi-asserted-by":"crossref","first-page":"533","DOI":"10.1016\/S0167-8655(00)00119-7","volume":"22","author":"D. Li","year":"2001","unstructured":"Li, D., Sethi, I., Dimitrova, N., McGee, T.: Classification of general audio data for content-based retrieval. Pattern Recognit. Lett. 22(5), 533\u2013544 (2001)","journal-title":"Pattern Recognit. Lett."},{"key":"266_CR62","doi-asserted-by":"crossref","unstructured":"Li, X., Tao, J., Johnson, M.T., Soltis, J., Savage, A., Leong, K.M., Newman, J.D.: Stress and emotion classification using Jitter and Shimmer features. In: IEEE International Conference on Acoustics Speech and Signal Processing (2007)","DOI":"10.1109\/ICASSP.2007.367261"},{"issue":"1","key":"266_CR63","doi-asserted-by":"crossref","first-page":"271","DOI":"10.1109\/TASL.2006.876860","volume":"15","author":"H. Li","year":"2007","unstructured":"Li, H., Ma, B., Lee, C.: A vector space modeling approach to spoken language identification. IEEE Trans. Audio Speech Lang. Process. 15(1), 271\u2013284 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"266_CR64","unstructured":"Linguistic Data Consortium (2010). http:\/\/www.ldc.upenn.edu\/"},{"key":"266_CR65","doi-asserted-by":"crossref","unstructured":"Liscombe, J., Riccardi, G., Hakkaini-T\u00fcr, D.: Using context to improve emotion detection in spoken dialog systems. In: Proceedings of Interspeech (2005)","DOI":"10.21437\/Interspeech.2005-583"},{"key":"266_CR66","doi-asserted-by":"crossref","unstructured":"Low, L.S.A., Maddage, N.C., Lech, M., Sheeber, L.B., Allen, N.B.: Detection of clinical depression n adolescents\u2019 speech during family interactions. In: IEEE Transactions on Biomedical Engineering (2011)","DOI":"10.1109\/TBME.2010.2091640"},{"issue":"6","key":"266_CR67","doi-asserted-by":"crossref","first-page":"482","DOI":"10.1007\/s00530-002-0065-0","volume":"8","author":"L. Lu","year":"2003","unstructured":"Lu, L., Zhang, H., Li, S.: Content-based audio classification and segmentation by using support vector machines. Multimed. Syst. 8(6), 482\u2013492 (2003)","journal-title":"Multimed. Syst."},{"key":"266_CR68","doi-asserted-by":"crossref","unstructured":"Lu, H., Pan, W., Lane, N., Choudhury, T., Campbell, A.: SoundSense: scalable sound sensing for people-centric applications on mobile phones. In: Proceedings of the 7th International Conference on Mobile Systems, Applications, and Services, pp. 165\u2013178. ACM, New York (2009)","DOI":"10.1145\/1555816.1555834"},{"key":"266_CR69","doi-asserted-by":"crossref","unstructured":"Ma, G., Zhou, W., Zheng, J., You, X., Ye, W.: A comparison between HTK and SPHINX on Chinese Mandarin. In: Proceedings of the 2009 International Joint Conference on Artificial Intelligence, pp. 394\u2013397. IEEE Computer Society, New York (2009)","DOI":"10.1109\/JCAI.2009.44"},{"key":"266_CR70","doi-asserted-by":"crossref","unstructured":"Makhoul, J.: Information extraction from speech. In: Spoken Language Technology Workshop, 2006, p. 3. IEEE, New York (2007)","DOI":"10.1109\/SLT.2006.326780"},{"issue":"2\u20133","key":"266_CR71","doi-asserted-by":"crossref","first-page":"303","DOI":"10.1016\/j.csl.2005.08.002","volume":"20","author":"S. Meignier","year":"2006","unstructured":"Meignier, S., Moraru, D., Fredouille, C., Bonastre, J., Besacier, L.: Step-by-step and integrated approaches in broadcast news speaker diarization. Comput. Speech Lang. 20(2\u20133), 303\u2013330 (2006)","journal-title":"Comput. Speech Lang."},{"key":"266_CR72","unstructured":"Meignier, S., Merlin, T.: Lium SpkDiarization: an open source toolkit for diarization. In: CMU SPUD Workshop (2010)"},{"key":"266_CR73","doi-asserted-by":"crossref","unstructured":"Meinedo, H., Neto, J.: Audio segmentation, classification and clustering in a broadcast news task. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP\u201903), vol. 2. IEEE, New York (2003)","DOI":"10.1109\/ICASSP.2003.1202280"},{"key":"266_CR74","doi-asserted-by":"crossref","unstructured":"Milner, B., Shao, X.: Speech reconstruction from mel-frequency cepstral coefficients using a source-filter model. In: Seventh International Conference on Spoken Language Processing (2002)","DOI":"10.21437\/ICSLP.2002-110"},{"key":"266_CR75","doi-asserted-by":"crossref","unstructured":"Miotto, R., Orio, N.: Automatic identification of music works through audio matching. In: ECDL (2007)","DOI":"10.1007\/978-3-540-74851-9_11"},{"key":"266_CR76","doi-asserted-by":"crossref","unstructured":"Moore, E. II, Clements, M.A., Peifer, J.W., Weisser, L.: Critical analysis of the impact of glottal features in the classification of clinical depression in speech. In: IEEE Transactions on Biomedical Engineering (2008)","DOI":"10.1109\/TBME.2007.900562"},{"key":"266_CR77","unstructured":"Nexidia: Nexidia Rich Media (2010). http:\/\/www.nexidia.com\/solutions\/rich_media"},{"key":"266_CR78","unstructured":"NIST: Nist Language Recognition Evaluation (2010). http:\/\/www.itl.nist.gov\/iad\/mig\/tests\/lre\/"},{"key":"266_CR79","unstructured":"NIST: Nist Speaker Recognition Evaluation (2010). http:\/\/www.itl.nist.gov\/iad\/mig\/\/tests\/sre\/"},{"key":"266_CR80","unstructured":"NIST: Rich Transcription Evaluation Project (2010). http:\/\/www.itl.nist.gov\/iad\/mig\/\/tests\/rt\/"},{"key":"266_CR81","unstructured":"Nwe, T.L., Wei, F.S., Silva, L.D.: Speech based emotion classification. In: Proceedings of IEEE Region 10 International Conference on Electrical and Electronic Technology (2001)"},{"key":"266_CR82","unstructured":"OLAC: Open Language Archives Community (2010). http:\/\/www.language-archives.org\/"},{"issue":"9","key":"266_CR83","doi-asserted-by":"crossref","first-page":"1272","DOI":"10.1109\/JPROC.2003.817117","volume":"91","author":"D. O\u2019Shaughnessy","year":"2003","unstructured":"O\u2019Shaughnessy, D.: Interacting with computers by voice: automatic speech recognition and synthesis. Proc. IEEE 91(9), 1272\u20131305 (2003)","journal-title":"Proc. IEEE"},{"key":"266_CR84","unstructured":"Padgett, C., Cottrell, G.: Representing face images for emotion classification. In: Advances in Neural Information Processing Systems (1997)"},{"key":"266_CR85","doi-asserted-by":"crossref","unstructured":"Pallett, D.: A look at NIST\u2019s benchmark ASR tests: past, present, and future. In: Proceedings of the 2003 IEEE Workshop on Automatic Speech Recognition and Understanding (2003)","DOI":"10.1109\/ASRU.2003.1318488"},{"issue":"1\/2","key":"266_CR86","first-page":"23","volume":"49","author":"C. Papaodysseus","year":"2001","unstructured":"Papaodysseus, C., Roussopoulos, G., Fragoulis, D., Panagopoulos, T., Alexiou, C.: A new approach to the automatic recognition of musical recordings. J. Audio Eng. Soc. 49(1\/2), 23\u201335 (2001)","journal-title":"J. Audio Eng. Soc."},{"key":"266_CR87","unstructured":"Pelecanos, J., Sridharan, S.: Feature warping for robust speaker verification. In: A Speaker Odyssey-The Speaker Recognition Workshop (2001)"},{"key":"266_CR88","doi-asserted-by":"crossref","unstructured":"Petrovska-Delacr\u00e9taz, D., El Hannani, A., Chollet, G.: Text-independent speaker verification: state of the art and challenges. In: Progress in Nonlinear Speech Processing, pp. 135\u2013169 (2007)","DOI":"10.1007\/978-3-540-71505-4_9"},{"key":"266_CR89","unstructured":"Poutsma, A.: Applying Monte Carlo techniques to language identification. In: Proceedings of Computational Linguistics in the Netherlands (CLIN) (2001)"},{"key":"266_CR90","unstructured":"R Development Core Team: R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna (2010). http:\/\/www.R-project.or . ISBN 3-900051-07-0"},{"issue":"12","key":"266_CR91","doi-asserted-by":"crossref","first-page":"2801","DOI":"10.1016\/S0031-3203(01)00235-7","volume":"35","author":"R. Ramachandran","year":"2002","unstructured":"Ramachandran, R., Farrell, K., Ramachandran, R., Mammone, R.: Speaker recognition\u2014general classifier approaches and data fusion methods. Pattern Recognit. 35(12), 2801\u20132821 (2002)","journal-title":"Pattern Recognit."},{"key":"266_CR92","doi-asserted-by":"crossref","unstructured":"Ravindran, S., Anderson, D., Slaney, M.: Low-power audio classification for ubiquitous sensor networks. In: Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (2004)","DOI":"10.1109\/ICASSP.2004.1326832"},{"key":"266_CR93","unstructured":"Recognition Technologies, Inc. (2010). http:\/\/www.recotechnologies.com"},{"key":"266_CR94","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1007\/978-3-642-00382-0_29","volume":"5449","author":"R. Rehurek","year":"2009","unstructured":"Rehurek, R., Kolkus, M.: Language identification on the web: extending the dictionary method. Lect. Notes Comput. Sci. 5449, 357\u2013368 (2009)","journal-title":"Lect. Notes Comput. Sci."},{"key":"266_CR95","unstructured":"Reynolds, D.: An overview of automatic speaker recognition technology. IEEE Int. Conf. Acoust. Speech Signal Process. 4, 4072\u20134075 (2002)"},{"key":"266_CR96","doi-asserted-by":"crossref","unstructured":"Reynolds, D.: Channel robust speaker verification via feature mapping. In: Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP\u201903), vol. 2 (2003)","DOI":"10.1109\/ICASSP.2003.1202292"},{"key":"266_CR97","unstructured":"Reynolds, D., Campbell, J., Campbell, W., Dunn, R., Gleason, T., Jones, D., Quatieri, T., Quillen, C., Sturim, D., Torres-Carrasquillo, P.: Beyond cepstra: exploiting high-level information in speaker recognition. In: Proceedings of the Workshop on Multimodal User Authentication, pp. 223\u2013229 (2003)"},{"key":"266_CR98","unstructured":"Reynolds, D., Torres-Carrasquillo, P.: The MIT Lincoln laboratory RT-04F diarization systems: applications to broadcast audio and telephone conversations. In: RT-04F Workshop (2004)"},{"key":"266_CR99","doi-asserted-by":"crossref","unstructured":"Reynolds, D., Torres-Carrasquillo, P.: Approaches and applications of audio diarization. In: Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP\u201905) (2005)","DOI":"10.1109\/ICASSP.2005.1416463"},{"key":"266_CR100","doi-asserted-by":"crossref","DOI":"10.1201\/9780203166369","volume-title":"Forensic Speaker Identification","author":"P. Rose","year":"2002","unstructured":"Rose, P.: Forensic Speaker Identification. CRC, Boca Raton (2002)"},{"key":"266_CR101","unstructured":"Satori, H., Hiyassat, H., Harti, M., Chenfour, N.: Investigation Arabic speech recognition using CMU Sphinx system. Int. Arab J. Inf. Technol. 6(2) (2009)"},{"key":"266_CR102","doi-asserted-by":"crossref","unstructured":"Schuller, B., Batliner, A., Seppi, D., Steidl, S., Vogt, T., Wagner, J., Devillers, L., Vidrascu, L., Amir, N., Kessous, L., Aharonson, V.: The relevance of feature type for the automatic classification of emotional user states: low level descriptors and functionals. In: INTERSPEECH (2007)","DOI":"10.21437\/Interspeech.2007-612"},{"key":"266_CR103","doi-asserted-by":"crossref","unstructured":"Sinha, R., Tranter, S., Gales, M., Woodland, P.: The Cambridge University March 2005 speaker diarisation system. In: Ninth European Conference on Speech Communication and Technology (2005)","DOI":"10.21437\/Interspeech.2005-650"},{"key":"266_CR104","doi-asserted-by":"crossref","unstructured":"Sonmez, M., Heck, L., Weintraub, M., Shriberg, E.: A lognormal tied mixture model of pitch for prosody-based speaker recognition. In: Proceedings of the Eurospeech, vol. 3, pp. 1391\u20131394 (1997)","DOI":"10.21437\/Eurospeech.1997-346"},{"key":"266_CR105","doi-asserted-by":"crossref","unstructured":"Sonmez, K., Shriberg, E., Heck, L., Weintraub, M.: Modeling dynamic prosodic variation for speaker verification. In: Fifth International Conference on Spoken Language Processing (1998)","DOI":"10.21437\/ICSLP.1998-254"},{"key":"266_CR106","unstructured":"SpeecFind: Search the Speech from Last Century (2010). http:\/\/speechfind.utdallas.edu\/"},{"key":"266_CR107","doi-asserted-by":"crossref","unstructured":"Stallard, D., Prasad, R., Natarajan, P.: Development and internal evaluation of speech-to-speech translation technology at BBN. In: PerMIS \u201909: Proceedings of the 9th Workshop on Performance Metrics for Intelligent Systems, pp. 231\u2013237. ACM, New York (2009). doi: 10.1145\/1865909.1865956","DOI":"10.1145\/1865909.1865956"},{"key":"266_CR108","doi-asserted-by":"crossref","first-page":"185","DOI":"10.1121\/1.1915893","volume":"8","author":"S. Stevens","year":"1937","unstructured":"Stevens, S., Volkmann, J., Newman, E.: A scale for the measurement of the psychological magnitude pitch. J. Acoust. Soc. Am. 8, 185 (1937)","journal-title":"J. Acoust. Soc. Am."},{"key":"266_CR109","doi-asserted-by":"crossref","unstructured":"Sueur, J., Aubin, T., Simonis, C.: Seewave: a free modular tool for sound analysis and synthesis. Bioacoustics 18, 213\u2013226 (2008). http:\/\/sueur.jerome.perso.neuf.fr\/WebPage_PapersPDF\/Sueuretal_Bioacoustics_2008.pdf","DOI":"10.1080\/09524622.2008.9753600"},{"key":"266_CR110","doi-asserted-by":"crossref","unstructured":"Sukittanon, S., Atlas, L.: Modulation frequency features for audio fingerprinting. In: IEEE International Conference on Acoustics Speech and Signal Processing, vol. 2 (2002)","DOI":"10.1109\/ICASSP.2002.5744966"},{"key":"266_CR111","unstructured":"Switchboard: Spontaneous conversation corpus (2010). http:\/\/www.isip.piconepress.com\/projects\/switchboard\/html\/overview.html"},{"key":"266_CR112","doi-asserted-by":"crossref","unstructured":"Teager, H.: Some observations on oral air flow during phonation. In: IEEE Transactions on Acoustics, Speech and Signal Processing (1980)","DOI":"10.1109\/TASSP.1980.1163453"},{"key":"266_CR113","doi-asserted-by":"crossref","unstructured":"Tokuhisa, R., Inui, K., Matsumoto, Y.: Emotion classification using massive examples extracted from the web. In: Proceedings of the 22nd International Conference on Computational Linguistics (2008)","DOI":"10.3115\/1599081.1599192"},{"key":"266_CR114","doi-asserted-by":"crossref","unstructured":"Tong, R., Ma, B., Zhu, D., Li, H., Chng, E.S.: Integrating acoustic, prosodic and phonotactic features for spoken language identification. In: IEEE International Conference on Acoustics, Speech and Signal Processing (2006)","DOI":"10.1109\/ICASSP.2006.1659993"},{"issue":"5","key":"266_CR115","doi-asserted-by":"crossref","first-page":"1557","DOI":"10.1109\/TASL.2006.878256","volume":"14","author":"S. Tranter","year":"2006","unstructured":"Tranter, S., Reynolds, D.: An overview of automatic speaker diarization systems. IEEE Trans. Audio Speech Lang. Process. 14(5), 1557\u20131565 (2006)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"266_CR116","unstructured":"Tzanetakis, G., Cook, F.: A framework for audio analysis based on classification and temporal segmentation. In: Proceedings of the 25th EUROMICRO Conference, 1999, vol. 2, pp. 61\u201367. IEEE, New York (2002)"},{"key":"266_CR117","unstructured":"Urbanek, S.: audio: Audio Interface for R (2012). http:\/\/www.rforge.net\/audio . R package version 0.1-3"},{"key":"266_CR118","unstructured":"Vertanen, K.: Baseline WSJ acoustic models for HTK and Sphinx: training recipes and recognition experiments. Tech. rep., Cavendish Laboratory, University of Cambridge (2006)"},{"key":"266_CR119","unstructured":"VoxForge: Free speech\u2026 recognition (2010). http:\/\/voxforge.org\/"},{"key":"266_CR120","unstructured":"Walker, W., Lamere, P., Kwok, P., Raj, B., Singh, R., Gouvea, E., Wolf, P., Woelfel, J.: Sphinx-4: A Flexible Open Source Framework for Speech Recognition, p. 18. Sun Microsystems, Inc., Mountain View (2004)"},{"key":"266_CR121","unstructured":"Wang, A.: An industrial strength audio search algorithm. In: International Conference on Music Information Retrieval (ISMIR) (2003)"},{"key":"266_CR122","doi-asserted-by":"crossref","unstructured":"Wassner, H., Chollet, G.: New cepstral representation using wavelet analysis and spectral transformation for robust speech recognition. In: Proceedings of ICSLP, vol. 96 (1996)","DOI":"10.1109\/ICSLP.1996.607094"},{"key":"266_CR123","doi-asserted-by":"crossref","unstructured":"Woodland, P., Odell, J., Valtchev, V., Young, S.: Large vocabulary continuous speech recognition using HTK. In: IEEE International Conference on Acoustics, Speech, and Signal Processing. ICASSP-94, vol. 2 (1994)","DOI":"10.1109\/ICASSP.1994.389562"},{"key":"266_CR124","doi-asserted-by":"crossref","unstructured":"Wooters, C., Huijbregts, M.: The ICSI RT07s speaker diarization system. In: Multimodal Technologies for Perception of Humans, pp. 509\u2013519 (2009)","DOI":"10.1007\/978-3-540-68585-2_47"},{"key":"266_CR125","doi-asserted-by":"crossref","unstructured":"Xu, M., Duan, L., Cai, J., Chia, L., Xu, C., Tian, Q.: HMM-based audio keyword generation. In: Advances in Multimedia Information Processing-PCM 2004, pp. 566\u2013574 (2005)","DOI":"10.1007\/978-3-540-30543-9_71"},{"key":"266_CR126","doi-asserted-by":"crossref","unstructured":"Yang, C., Lin, K.H.Y., Chen, H.H.: Emotion classification using web blog corpora. In: Proceedings of the IEEE\/WIC\/ACM International Conference on Web Intelligence (2007)","DOI":"10.1109\/WI.2007.51"},{"key":"266_CR127","volume-title":"The HTK Book","author":"S. Young","year":"2002","unstructured":"Young, S., Evermann, G., Kershaw, D., Moore, G., Odell, J., Ollason, D., Valtchev, V., Woodland, P.: The HTK Book. Cambridge University Engineering Department, Cambridge (2002)"},{"key":"266_CR128","doi-asserted-by":"crossref","unstructured":"Zhang, T., Kuo, C.: Hierarchical classification of audio data for archiving and retrieving. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP, vol. 6, pp. 3001\u20133004 (1999)","DOI":"10.1109\/ICASSP.1999.757472"},{"key":"266_CR129","doi-asserted-by":"crossref","unstructured":"Zhang, J., Whalley, J., Brooks, S.: A two phase method for general audio segmentation. In: IEEE International Conference on Multimedia and Expo, ICME 2009, pp. 626\u2013629. IEEE (2009)","DOI":"10.1109\/ICME.2009.5202574"},{"key":"266_CR130","unstructured":"Zhang, X.: Audio Segmentation, Classification and Visualization. Ph.D. thesis, Auckland University of Technology (2009)"},{"key":"266_CR131","doi-asserted-by":"crossref","unstructured":"Zhu, X., Barras, C., Meignier, S., Gauvain, J.: Combining speaker identification and BIC for speaker diarization. In: Ninth European Conference on Speech Communication and Technology (2005)","DOI":"10.21437\/Interspeech.2005-651"},{"key":"266_CR132","doi-asserted-by":"crossref","unstructured":"Zhu, X., Barras, C., Lamel, L., Gauvain, J.: Speaker diarization: from broadcast news to lectures. In: Machine Learning for Multimodal Interaction, pp. 396\u2013406 (2006)","DOI":"10.1007\/11965152_35"},{"key":"266_CR133","doi-asserted-by":"crossref","first-page":"248","DOI":"10.1121\/1.1908630","volume":"33","author":"E. Zwicker","year":"1961","unstructured":"Zwicker, E.: Subdivision of the audible frequency range into critical bands (Frequenzgruppen). Acoust. Soc. Am. J. 33, 248 (1961)","journal-title":"Acoust. Soc. Am. J."}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-012-0266-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00530-012-0266-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-012-0266-0","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T10:06:12Z","timestamp":1743847572000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00530-012-0266-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,7,26]]},"references-count":133,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2012,11]]}},"alternative-id":["266"],"URL":"https:\/\/doi.org\/10.1007\/s00530-012-0266-0","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2012,7,26]]}}}