{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:51:54Z","timestamp":1740099114792,"version":"3.37.3"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319995786"},{"type":"electronic","value":"9783319995793"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-319-99579-3_14","type":"book-chapter","created":{"date-parts":[[2018,8,24]],"date-time":"2018-08-24T07:36:09Z","timestamp":1535096169000},"page":"123-133","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Speaker Diarization: A Top-Down Approach Using Syllabic Phonology"],"prefix":"10.1007","author":[{"given":"Erik","family":"Edwards","sequence":"first","affiliation":[]},{"given":"Amanda","family":"Robinson","sequence":"additional","affiliation":[]},{"given":"Najmeh","family":"Sadoughi","sequence":"additional","affiliation":[]},{"given":"Greg P.","family":"Finley","sequence":"additional","affiliation":[]},{"given":"Maxim","family":"Korenevsky","sequence":"additional","affiliation":[]},{"given":"Michael","family":"Brenndoerfer","sequence":"additional","affiliation":[]},{"given":"Nico","family":"Axtmann","sequence":"additional","affiliation":[]},{"given":"Mark","family":"Miller","sequence":"additional","affiliation":[]},{"given":"David","family":"Suendermann-Oeft","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,8,25]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Anguera Mir\u00f3, X.: Robust speaker diarization for meetings. Ph.D. thesis, Univ. Polit\u00e8cnica de Catalunya (2006)","key":"14_CR1","DOI":"10.21437\/Interspeech.2006-466"},{"issue":"2","key":"14_CR2","doi-asserted-by":"publisher","first-page":"356","DOI":"10.1109\/TASL.2011.2125954","volume":"20","author":"X Anguera Mir\u00f3","year":"2012","unstructured":"Anguera Mir\u00f3, X., Bozonnet, S., Evans, N., Fredouille, C., Friedland, G., Vinyals, O.: Speaker diarization: a review of recent research. IEEE Trans. Audio Speech Lang. Process. 20(2), 356\u2013370 (2012)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"14_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/11677482_34","volume-title":"Machine Learning for Multimodal Interaction","author":"X Anguera","year":"2006","unstructured":"Anguera, X., Wooters, C., Peskin, B., Aguil\u00f3, M.: Robust speaker segmentation for meetings: the ICSI-SRI spring 2005 diarization system. In: Renals, S., Bengio, S. (eds.) MLMI 2005. LNCS, vol. 3869, pp. 402\u2013414. Springer, Heidelberg (2006). https:\/\/doi.org\/10.1007\/11677482_34"},{"doi-asserted-by":"crossref","unstructured":"Bozonnet, S., Vipperla, R., Evans, N.: Phone adaptive training for speaker diarization. In: Proceedings of INTERSPEECH, pp. 494\u2013497. ISCA (2012)","key":"14_CR4","DOI":"10.21437\/Interspeech.2012-166"},{"doi-asserted-by":"crossref","unstructured":"Chen, I.F., Cheng, S.S., Wang, H.M.: Phonetic subspace mixture model for speaker diarization. In: Proceedings of INTERSPEECH, pp. 2298\u20132301. ISCA (2010)","key":"14_CR5","DOI":"10.21437\/Interspeech.2010-630"},{"issue":"6","key":"14_CR6","doi-asserted-by":"publisher","first-page":"597","DOI":"10.1121\/1.1906940","volume":"24","author":"F Cooper","year":"1952","unstructured":"Cooper, F., Delattre, P., Liberman, A., Borst, J., Gerstman, L.: Some experiments on the perception of synthetic speech sounds. J. Acoust. Soc. Am. 24(6), 597\u2013606 (1952)","journal-title":"J. Acoust. Soc. Am."},{"key":"14_CR7","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"512","DOI":"10.1007\/978-3-319-66429-3_51","volume-title":"Speech and Computer","author":"E Edwards","year":"2017","unstructured":"Edwards, E., et al.: Medical speech recognition: reaching parity with humans. In: Karpov, A., Potapova, R., Mporas, I. (eds.) SPECOM 2017. LNCS (LNAI), vol. 10458, pp. 512\u2013524. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-66429-3_51"},{"issue":"1","key":"14_CR8","doi-asserted-by":"publisher","first-page":"57","DOI":"10.1016\/0167-6393(93)90018-G","volume":"12","author":"N Fakotakis","year":"1993","unstructured":"Fakotakis, N., Tsopanoglou, A., Kokkinakis, G.: A text-independent speaker recognition system based on vowel spotting. Speech Commun. 12(1), 57\u201368 (1993)","journal-title":"Speech Commun."},{"doi-asserted-by":"crossref","unstructured":"Finley, G., et al.: An automated medical scribe for documenting clinical encounters. In: Proceedings of NAACL. ACL (2018)","key":"14_CR9","DOI":"10.18653\/v1\/N18-5003"},{"issue":"2","key":"14_CR10","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1017\/S0022226700011312","volume":"23","author":"E Fudge","year":"1987","unstructured":"Fudge, E.: Branching structure within the syllable. J. Linguist. 23(2), 359\u2013377 (1987)","journal-title":"J. Linguist."},{"issue":"1","key":"14_CR11","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/TASSP.1975.1162631","volume":"23","author":"O Fujimura","year":"1975","unstructured":"Fujimura, O.: Syllable as a unit of speech recognition. IEEE Trans. Acoust. 23(1), 82\u201387 (1975)","journal-title":"IEEE Trans. Acoust."},{"doi-asserted-by":"crossref","unstructured":"Gauvain, J.L., Adda, G., Lamel, L., Adda-Decker, M.: Transcribing broadcast news: the LIMSI Nov96 Hub4 system. In: Proceedings of DARPA Speech Recognition Workshop, pp. 56\u201363. DARPA (1997)","key":"14_CR12","DOI":"10.21437\/Eurospeech.1997-323"},{"doi-asserted-by":"crossref","unstructured":"Ghahremani, P., BabaAli, B., Povey, D., Riedhammer, K., Trmal, J., Khudanpur, S.: A pitch extraction algorithm tuned for automatic speech recognition. In: Proceedings of ICASSP, pp. 2494\u20132498. IEEE (2014)","key":"14_CR13","DOI":"10.1109\/ICASSP.2014.6854049"},{"doi-asserted-by":"crossref","unstructured":"Gish, H., Siu, M.H., Rohlicek, J.: Segregation of speakers for speech recognition and speaker identification. In: Proceedings of ICASSP, vol. 2, pp. 873\u2013876. IEEE (1991)","key":"14_CR14","DOI":"10.1109\/ICASSP.1991.150477"},{"doi-asserted-by":"crossref","unstructured":"Goldsmith, J.: The syllable. In: Goldsmith, J., Riggle, J., Yu, A. (eds.) The Handbook of Phonological Theory, 2nd edn., pp. 165\u2013196. Wiley, Malden (2011)","key":"14_CR15","DOI":"10.1002\/9781444343069.ch6"},{"key":"14_CR16","volume-title":"A History of English Rhythms","author":"E Guest","year":"1838","unstructured":"Guest, E.: A History of English Rhythms. W. Pickering, London (1838)"},{"unstructured":"Hansen, E., Slyh, R., Anderson, T.: Speaker recognition using phoneme-specific GMMs. In: Proceedings of Odyssey Workshop, pp. 179\u2013184. ISCA (2004)","key":"14_CR17"},{"doi-asserted-by":"crossref","unstructured":"Hsieh, C.H., Wu, C.H., Shen, H.P.: Adaptive decision tree-based phone cluster models for speaker clustering. In: Proceedings of INTERSPEECH, pp. 861\u2013864. ISCA (2008)","key":"14_CR18","DOI":"10.21437\/Interspeech.2008-276"},{"issue":"3","key":"14_CR19","doi-asserted-by":"publisher","first-page":"295","DOI":"10.1006\/jmla.1997.2522","volume":"37","author":"B Kessler","year":"1997","unstructured":"Kessler, B., Treiman, R.: Syllable structure and the distribution of phonemes in English syllables. J. Mem. Lang. 37(3), 295\u2013311 (1997)","journal-title":"J. Mem. Lang."},{"unstructured":"Kozhevnikov, V., Chistovich, L.: Speech: articulation and perception. Translation JPRS 30543, Joint Public Research Service, U.S. Department of Commerce (1965)","key":"14_CR20"},{"issue":"4","key":"14_CR21","doi-asserted-by":"publisher","first-page":"1035","DOI":"10.1002\/j.1538-7305.1983.tb03114.x","volume":"62","author":"S Levinson","year":"1983","unstructured":"Levinson, S., Rabiner, L., Sondhi, M.: An introduction to the application of the theory of probabilistic functions of a Markov process to automatic speech recognition. Bell Syst. Tech. J. 62(4), 1035\u20131074 (1983)","journal-title":"Bell Syst. Tech. J."},{"issue":"11","key":"14_CR22","doi-asserted-by":"publisher","first-page":"1490","DOI":"10.1121\/1.1907654","volume":"31","author":"A Liberman","year":"1959","unstructured":"Liberman, A., Ingemann, F., Lisker, L., Delattre, P., Cooper, F.: Minimal rules for synthesizing speech. J. Acoust. Soc. Am. 31(11), 1490\u20131499 (1959)","journal-title":"J. Acoust. Soc. Am."},{"unstructured":"Martin, T., Wong, E., Baker, B., Mason, M., Sridharan, S.: Pitch and energy trajectory modelling in a syllable length temporal framework for language identification. In: Proceedings of Odyssey Workshop, pp. 289\u2013296. ISCA (2004)","key":"14_CR23"},{"issue":"10","key":"14_CR24","doi-asserted-by":"publisher","first-page":"782","DOI":"10.1016\/j.specom.2008.04.010","volume":"50","author":"L Mary","year":"2008","unstructured":"Mary, L., Yegnanarayana, B.: Extraction and representation of prosodic features for language and speaker recognition. Speech Commun. 50(10), 782\u2013796 (2008)","journal-title":"Speech Commun."},{"key":"14_CR25","volume-title":"An Inquiry into the Principles of Harmony in Language, and of the Mechanism of Verse, Modern and Antient","author":"W Mitford","year":"1804","unstructured":"Mitford, W.: An Inquiry into the Principles of Harmony in Language, and of the Mechanism of Verse, Modern and Antient, 2nd edn. L. Hansard, London (1804)","edition":"2"},{"issue":"6","key":"14_CR26","doi-asserted-by":"publisher","first-page":"1072","DOI":"10.1121\/1.1908561","volume":"28","author":"H Olson","year":"1956","unstructured":"Olson, H., Belar, H.: Phonetic typewriter. J. Acoust. Soc. Am. 28(6), 1072\u20131081 (1956)","journal-title":"J. Acoust. Soc. Am."},{"doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: LibriSpeech: an ASR corpus based on public domain audio books. In: Proceedings of ICASSP, pp. 5206\u20135210. IEEE (2015)","key":"14_CR27","DOI":"10.1109\/ICASSP.2015.7178964"},{"unstructured":"Rudnicky, A.: CMUdict 0.7b: School of Computer Science, Carnegie Mellon University, Pittsburgh, PA (2015). https:\/\/github.com\/Alexir\/CMUdict","key":"14_CR28"},{"issue":"3","key":"14_CR29","doi-asserted-by":"publisher","first-page":"197","DOI":"10.1109\/LSP.2013.2237903","volume":"20","author":"S Sadjadi","year":"2013","unstructured":"Sadjadi, S., Hansen, J.: Unsupervised speech activity detection using voicing measures and perceptual spectral flux. IEEE Signal Process. Lett. 20(3), 197\u2013200 (2013)","journal-title":"IEEE Signal Process. Lett."},{"unstructured":"Saussure, F.: Cours de linguistique g\u00e9n\u00e9rale. Payot, Lausanne, Paris (1916)","key":"14_CR30"},{"doi-asserted-by":"crossref","unstructured":"Schindler, C., Draxler, C.: Using spectral moments as a speaker specific feature in nasals and fricatives. In: Proceedings of INTERSPEECH, pp. 2793\u20132796. ISCA (2013)","key":"14_CR31","DOI":"10.21437\/Interspeech.2013-639"},{"unstructured":"Selkirk, E.: The syllable. In: van der Hulst, H., Smith, N. (eds.) The Structure of Phonological Representations, vol. 2, pp. 337\u2013384. Foris, Dordrecht (1982)","key":"14_CR32"},{"key":"14_CR33","first-page":"125","volume-title":"Trends in Speech Recognition","author":"J Shoup","year":"1980","unstructured":"Shoup, J.: Phonological aspects of speech recognition. In: Lea, W. (ed.) Trends in Speech Recognition, pp. 125\u2013138. Prentice-Hall, Englewood Cliffs (1980)"},{"issue":"3\u20134","key":"14_CR34","doi-asserted-by":"publisher","first-page":"455","DOI":"10.1016\/j.specom.2005.02.018","volume":"46","author":"E Shriberg","year":"2005","unstructured":"Shriberg, E., Ferrer, L., Kajarekar, S., Venkataraman, A., Stolcke, A.: Modeling prosodic feature sequences for speaker recognition. Speech Commun. 46(3\u20134), 455\u2013472 (2005)","journal-title":"Speech Commun."},{"doi-asserted-by":"crossref","unstructured":"Siu, M.H., Yu, G., Gish, H.: An unsupervised, sequential learning algorithm for the segmentation of speech waveforms with multiple speakers. In: Proceedings of ICASSP, vol. 2, pp. 189\u2013192. IEEE (1992)","key":"14_CR35","DOI":"10.1109\/ICASSP.1992.226088"},{"doi-asserted-by":"crossref","unstructured":"Soldi, G., Bozonnet, S., Alegre, F., Beaugeant, C., Evans, N.: Short-duration speaker modelling with phone adaptive training. In: Proceedings of Odyssey Workshop, pp. 208\u2013215. ISCA (2014)","key":"14_CR36","DOI":"10.21437\/Odyssey.2014-32"},{"doi-asserted-by":"crossref","unstructured":"Sugiyama, M., Murakami, J., Watanabe, H.: Speech segmentation and clustering based on speaker features. In: Proceedings of ICASSP, vol. 2, pp. 395\u2013398. IEEE (1993)","key":"14_CR37","DOI":"10.1109\/ICASSP.1993.319322"},{"unstructured":"Wallis, J.: Grammatica linguae Anglicanae. L. Lichfield, Oxford (1674)","key":"14_CR38"},{"doi-asserted-by":"crossref","unstructured":"Wang, G., Wu, X., Zheng, T.: Using phoneme recognition and text-dependent speaker verification to improve speaker segmentation for Chinese speech. In: Proceedings of INTERSPEECH, pp. 1457\u20131460. ISCA (2010)","key":"14_CR39","DOI":"10.21437\/Interspeech.2010-148"},{"doi-asserted-by":"crossref","unstructured":"Wilcox, L., Chen, F., Kimber, D., Balasubramanian, V.: Segmentation of speech using speaker identification. In: Proceedings of ICASSP, vol. 1, pp. 161\u2013164. IEEE (1994)","key":"14_CR40","DOI":"10.1109\/ICASSP.1994.389330"},{"doi-asserted-by":"crossref","unstructured":"Yamada, M., Pezeshki, A., Azimi-Sadjadi, M.: Relation between kernel CCA and kernel FDA. In: Proceedings of IJCNN, pp. 226\u2013231. IEEE (2005)","key":"14_CR41","DOI":"10.1109\/IJCNN.2005.1555834"},{"doi-asserted-by":"crossref","unstructured":"Yella, S., Motl\u00edcek, P., Bourlard, H.: Phoneme background model for information bottleneck based speaker diarization. In: Proceedings of INTERSPEECH, pp. 597\u2013601. ISCA (2014)","key":"14_CR42","DOI":"10.1109\/ICASSP.2014.6853565"},{"doi-asserted-by":"crossref","unstructured":"Zibert, J., Mihelic, F.: Prosodic and phonetic features for speaker clustering in speaker diarization systems. In: Proceedings of INTERSPEECH, pp. 1033\u20131036. ISCA (2011)","key":"14_CR43","DOI":"10.21437\/Interspeech.2011-387"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-99579-3_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,30]],"date-time":"2022-08-30T04:53:55Z","timestamp":1661835235000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-99579-3_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783319995786","9783319995793"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-99579-3_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2018]]}}}