{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:48:54Z","timestamp":1740098934118,"version":"3.37.3"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319664286"},{"type":"electronic","value":"9783319664293"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-3-319-66429-3_55","type":"book-chapter","created":{"date-parts":[[2017,8,12]],"date-time":"2017-08-12T02:02:55Z","timestamp":1502503375000},"page":"555-563","source":"Crossref","is-referenced-by-count":0,"title":["Neural Network Speaker Descriptor in Speaker Diarization of Telephone Speech"],"prefix":"10.1007","author":[{"given":"Zbyn\u011bk","family":"Zaj\u00edc","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jan","family":"Zelinka","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lud\u011bk","family":"M\u00fcller","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,8,13]]},"reference":[{"key":"55_CR1","doi-asserted-by":"crossref","unstructured":"Adami, A.G., Kajarekar, S.S., Hermansky, H.: A new speaker change detection method for two-speaker segmentation. In: ICASSP, vol. 4, pp. 3908\u20133911 (2002)","DOI":"10.1109\/ICASSP.2002.1004772"},{"key":"55_CR2","doi-asserted-by":"crossref","unstructured":"Bredin, H.: TristouNet: triplet loss for speaker turn embedding. In: ICASSP, New Orleans, pp. 5430\u20135434 (2017)","DOI":"10.1109\/ICASSP.2017.7953194"},{"key":"55_CR3","unstructured":"Canavan, A., Graff, D., Zipperlen, G.: CALLHOME American English speech, LDC97S42. In: LDC Catalog. Linguistic Data Consortium, Philadelphia (1997)"},{"issue":"4","key":"55_CR4","doi-asserted-by":"crossref","first-page":"788","DOI":"10.1109\/TASL.2010.2064307","volume":"19","author":"N Dehak","year":"2011","unstructured":"Dehak, N., Kenny, P.J., Dehak, R., Dumouchel, P., Ouellet, P.: Front-end factor analysis for speaker verification. IEEE Trans. Audio Speech Lang. Process. 19(4), 788\u2013798 (2011)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"55_CR5","doi-asserted-by":"crossref","first-page":"309","DOI":"10.1007\/11965152_28","volume":"4299","author":"JG Fiscus","year":"2006","unstructured":"Fiscus, J.G., Radde, N., Garofolo, J.S., Le, A., Ajot, J., Laprun, C.: The rich transcription 2006 spring meeting recognition evaluation. Mach. Learn. Multimodal Interact. 4299, 309\u2013322 (2006)","journal-title":"Mach. Learn. Multimodal Interact."},{"key":"55_CR6","unstructured":"Fredouille, C., Bozonnet, S., Evans, N.: The LIA-EURECOM RT 2009 Speaker Diarization System. In: NIST Rich Transcription Workshop (RT09), Melbourne, USA (2009)"},{"key":"55_CR7","doi-asserted-by":"crossref","unstructured":"Furui, S., Itoh, D.: Neural-network-based HMM adaptation for noisy speech. In: ICASSP, Salt Lake City, pp. 365\u2013368 (2001)","DOI":"10.1109\/ICASSP.2001.940843"},{"key":"55_CR8","doi-asserted-by":"crossref","unstructured":"Garcia-Romero, D., Espy-Wilson, C.Y.: Analysis of i-Vector length normalization in speaker recognition systems. In: Interspeech, Florence, pp. 249\u2013252 (2011)","DOI":"10.21437\/Interspeech.2011-53"},{"key":"55_CR9","doi-asserted-by":"crossref","unstructured":"Garcia-Romero, D., McCree, A., Shum, S., Brummer, N., Vaquero, C.: Unsupervised domain adaptation for i-Vector speaker recognition. In: Odyssey - Speaker and Language Recognition Workshop, Joensuu, pp. 260\u2013264 (2014)","DOI":"10.1109\/ICASSP.2014.6854362"},{"key":"55_CR10","doi-asserted-by":"crossref","unstructured":"Garcia-Romero, D., Snyder, D., Sell, G., Povey, D., McCree, A.: Speaker diarization using deep neural network embedings. In: ICASSP, New Orleans, pp. 4930\u20134934 (2017)","DOI":"10.1109\/ICASSP.2017.7953094"},{"key":"55_CR11","unstructured":"Graff, D., Miller, D., Walker, K.: Switchboard-2 phase III audio. In: LDC Catalog. Linguistic Data Consortium, Philadelphia (1999)"},{"key":"55_CR12","unstructured":"Graff, D., Walker, K., Canavan, A.: Switchboard-2 phase II, LDC99S79. In: LDC Catalog. Linguistic Data Consortium, Philadelphia (2002)"},{"key":"55_CR13","doi-asserted-by":"crossref","unstructured":"Gupta, V.: Speaker change point detection using deep neural nets. In: ICASSP, Brisbane, pp. 4420\u20134424 (2015)","DOI":"10.1109\/ICASSP.2015.7178806"},{"key":"55_CR14","doi-asserted-by":"crossref","unstructured":"Hershey, J.R., Chen, Z., Roux, J.L., Watanabe, S.: Deep clustering: discriminative embeddings for segmentation and separation. In: ICASSP, Shanghai, pp. 31\u201335 (2016)","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"55_CR15","doi-asserted-by":"crossref","unstructured":"Hr\u00faz, M., Zaj\u00edc, Z.: Convolutional neural network for speaker change detection in telephone speaker Diarization system. In: ICASSP, New Orleans, pp. 4945\u20134949 (2017)","DOI":"10.1109\/ICASSP.2017.7953097"},{"key":"55_CR16","unstructured":"Kenny, P.: Joint factor analysis of speaker and session variability: theory and algorithms. Technical report, Centre de Recherche Informatique de Montreal (2006)"},{"key":"55_CR17","unstructured":"Kenny, P., Dumouchel, P.: Experiments in speaker verification using factor analysis likelihood ratios. In: Odyssey - Speaker and Language Recognition Workshop, Toledo, pp. 219\u2013226 (2004)"},{"key":"55_CR18","doi-asserted-by":"crossref","unstructured":"Machlica, L., Zaj\u00edc, Z.: Factor analysis and nuisance attribute projection revisited. In: Interspeech, Portland, pp. 1570\u20131573 (2012)","DOI":"10.21437\/Interspeech.2012-339"},{"key":"55_CR19","doi-asserted-by":"crossref","unstructured":"Martin, A., Przybocki, M.: 2004 NIST speaker recognition evaluation, LDC 2006 S44. In: LDC Catalog. Linguistic Data Consortium, Philadelphia (2011)","DOI":"10.1109\/ODYSSEY.2006.248120"},{"key":"55_CR20","doi-asserted-by":"crossref","unstructured":"Milner, R., Hain, T.: DNN-based speaker clustering for speaker Diarisation. In: Interspeech, San Francisco, 08 September 2012, pp. 2185\u20132189 (2016)","DOI":"10.21437\/Interspeech.2016-126"},{"key":"55_CR21","unstructured":"NIST Multimodal Information Group: 2005 NIST Speaker Recognition Evaluation Training Data, LDC2011S01. In: LDC Catalog. Linguistic Data Consortium, Philadelphia (2011)"},{"key":"55_CR22","unstructured":"NIST Multimodal Information Group: 2006 NIST Speaker Recognition Evaluation Training Set, LDC2011S09. In: LDC Catalog (2011)"},{"key":"55_CR23","doi-asserted-by":"crossref","unstructured":"Rouvier, M., Dupuy, G., Gay, P., Khoury, E., Merlin, T., Meignier, S.: An open-source state-of-the-art toolbox for broadcast news Diarization. In: Interspeech, Lyon, p. 5 (2013)","DOI":"10.21437\/Interspeech.2013-383"},{"key":"55_CR24","doi-asserted-by":"crossref","unstructured":"Sell, G., Garcia-Romero, D.: Speaker Diarization with PLDA i-Vector scoring and unsupervised calibration. In: IEEE Spoken Language Technology Workshop, South Lake Tahoe, pp. 413\u2013417 (2014)","DOI":"10.1109\/SLT.2014.7078610"},{"key":"55_CR25","doi-asserted-by":"crossref","unstructured":"Sell, G., Garcia-Romero, D., Mccree, A.: Speaker Diarization with i-Vectors from DNN senone posteriors. In: Interspeech, Dresden, pp. 3096\u20133099 (2015)","DOI":"10.21437\/Interspeech.2015-109"},{"issue":"1","key":"55_CR26","doi-asserted-by":"crossref","first-page":"217","DOI":"10.1109\/TASLP.2013.2285474","volume":"22","author":"M Senoussaoui","year":"2014","unstructured":"Senoussaoui, M., Kenny, P., Stafylakis, T., Dumouchel, P.: A study of the Cosine distance-based mean shift for telephone speech diarization. Audio, Speech Lang. Process. 22(1), 217\u2013227 (2014)","journal-title":"Audio, Speech Lang. Process."},{"key":"55_CR27","doi-asserted-by":"crossref","unstructured":"Shum, S., Dehak, N., Chuangsuwanich, E., Reynolds, D., Glass, J.: Exploiting intra-conversation variability for speaker diarization. In: Interspeech, Florence, pp. 945\u2013948 (2011)","DOI":"10.21437\/Interspeech.2011-383"},{"issue":"10","key":"55_CR28","doi-asserted-by":"crossref","first-page":"2015","DOI":"10.1109\/TASL.2013.2264673","volume":"21","author":"SH Shum","year":"2013","unstructured":"Shum, S.H., Dehak, N., Dehak, R., Glass, J.R.: Unsupervised methods for speaker diarization: an integrated and iterative approach. Audio, Speech Lang. Process. 21(10), 2015\u20132028 (2013)","journal-title":"Audio, Speech Lang. Process."},{"key":"55_CR29","unstructured":"Theano Development Team: Theano: A Python Framework for Fast Computation of Mathematical Expressions. arXiv e-prints abs\/1605.0 (2016)"},{"key":"55_CR30","doi-asserted-by":"crossref","unstructured":"Wang, R., Gu, M., Li, L., Xu, M., Zheng, T.F.: Speaker segmentation using deep speaker vectors for fast speaker change scenarios. In: ICASSP, New Orleans, pp. 5420\u20135424 (2017)","DOI":"10.1109\/ICASSP.2017.7953192"},{"key":"55_CR31","doi-asserted-by":"crossref","unstructured":"Yells, S.H., Stolcke, A., Slaney, M.: Artificial neural network features for speaker diarization. In: Proceedings of IEEE Spoken Language Technology Workshop, pp. 402\u2013406. IEEE (2014)","DOI":"10.1109\/SLT.2014.7078608"},{"key":"55_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"411","DOI":"10.1007\/978-3-319-43958-7_49","volume-title":"Speech and Computer","author":"Z Zaj\u00edc","year":"2016","unstructured":"Zaj\u00edc, Z., Kune\u0161ov\u00e1, M., Radov\u00e1, V.: Investigation of segmentation in i-vector based speaker diarization of telephone speech. In: Ronzhin, A., Potapova, R., N\u00e9meth, G. (eds.) SPECOM 2016. LNCS, vol. 9811, pp. 411\u2013418. Springer, Cham (2016). doi: 10.1007\/978-3-319-43958-7_49"},{"key":"55_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1007\/978-3-642-23538-2_24","volume-title":"Text, Speech and Dialogue","author":"Z Zaj\u00edc","year":"2011","unstructured":"Zaj\u00edc, Z., Machlica, L., M\u00fcller, L.: Initialization of fMLLR with sufficient statistics from similar speakers. In: Habernal, I., Matou\u0161ek, V. (eds.) TSD 2011. LNCS, vol. 6836, pp. 187\u2013194. Springer, Heidelberg (2011). doi: 10.1007\/978-3-642-23538-2_24"},{"key":"55_CR34","doi-asserted-by":"crossref","unstructured":"Zelinka, J., Van\u011bk, J., M\u00fcller, L.: Neural-network-based spectrum processing for speech recognition and speaker verification. In: Statistical Language and Speech Processing, Budapest, vol. 9449, pp. 288\u2013299 (2015)","DOI":"10.1007\/978-3-319-25789-1_27"},{"key":"55_CR35","doi-asserted-by":"crossref","unstructured":"Zhu, W., Pelecanos, J.: Online speaker Diarization using adapted i-Vector transforms. In: ICASSP, Shanghai, pp. 5045\u20135049 (2016)","DOI":"10.1109\/ICASSP.2016.7472638"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-66429-3_55","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,1]],"date-time":"2022-08-01T03:37:09Z","timestamp":1659325029000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-66429-3_55"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9783319664286","9783319664293"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-66429-3_55","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2017]]}}}