{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:28:13Z","timestamp":1750436893705,"version":"3.40.3"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031223204"},{"type":"electronic","value":"9783031223211"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-22321-1_17","type":"book-chapter","created":{"date-parts":[[2022,11,30]],"date-time":"2022-11-30T09:06:15Z","timestamp":1669799175000},"page":"246-259","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Fine-Tuned Self-supervised Speech Representations for\u00a0Language Diarization in\u00a0Multilingual Code-Switched Speech"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6107-3858","authenticated-orcid":false,"given":"Geoffrey","family":"Frost","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1336-3903","authenticated-orcid":false,"given":"Emily","family":"Morris","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3406-4788","authenticated-orcid":false,"given":"Joshua","family":"Jansen van V\u00fcren","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7341-1017","authenticated-orcid":false,"given":"Thomas","family":"Niesler","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,28]]},"reference":[{"key":"17_CR1","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems, vol. 33, pp. 12449\u201312460 (2020)"},{"key":"17_CR2","unstructured":"Brummer, N.: Measuring, refining and calibrating speaker and language information extracted from speech. Ph.D. thesis, University of Stellenbosch, Stellenbosch (2010)"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Cai, W., Cai, Z., Liu, W., Wang, X., Li, M.: Insights in-to-end learning scheme for language identification. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5209\u20135213 (2018)","DOI":"10.1109\/ICASSP.2018.8462026"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Chen, G., et al.: Gigaspeech: an evolving, multi-domain ASR corpus with 10,000 hours of transcribed audio. In: Proceedings of Interspeech (2021)","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"17_CR5","doi-asserted-by":"publisher","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","volume":"6","author":"S Chen","year":"2022","unstructured":"Chen, S., et al.: WavLM: large-scale self-supervised pre-training for full stack speech processing. IEEE J. Sel. Topics Signal Process. 6, 1505\u20131518 (2022)","journal-title":"IEEE J. Sel. Topics Signal Process."},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Chi, Z., et al.: XLM-E: cross-lingual language model pre-training via electra. arXiv preprint arXiv:2106.16138 (2021)","DOI":"10.18653\/v1\/2022.acl-long.427"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Fujita, Y., Kanda, N., Horiguchi, S., Nagamatsu, K., Watanabe, S.: End-to-end neural speaker diarization with permutation-free objectives. In: Proceedings of Interspeech (2019)","DOI":"10.21437\/Interspeech.2019-2899"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Gelly, G., Gauvain, J.L.: Spoken language identification using LSTM-based angular proximity. In: Proceedings of Interspeech, pp. 2566\u20132570 (2017)","DOI":"10.21437\/Interspeech.2017-1334"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Geng, W., et al.: End-to-end language identification using attention-based recurrent neural networks. In: Proceedings of Interspeech, pp. 2944\u20132948 (2016)","DOI":"10.21437\/Interspeech.2016-686"},{"key":"17_CR10","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1016\/j.neunet.2014.08.006","volume":"64","author":"J Gonzalez-Dominguez","year":"2015","unstructured":"Gonzalez-Dominguez, J., Lopez-Moreno, I., Moreno, P.J., Gonzalez-Rodriguez, J.: Frame-by-frame language identification in short utterances using deep neural networks. Neural Netw. 64, 49\u201358 (2015)","journal-title":"Neural Netw."},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Hershey, J.R., Chen, Z., Le Roux, J., Watanabe, S.: Deep clustering: discriminative embeddings for segmentation and separation. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 31\u201335. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Hieronymus, J.L., Kadambe, S.: Spoken language identification using large vocabulary speech recognition. In: Proceedings of Fourth International Conference on Spoken Language Processing (ICSLP), pp. 1780\u20131783 (1996)","DOI":"10.21437\/ICSLP.1996-452"},{"key":"17_CR13","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: Hubert: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"17_CR14","doi-asserted-by":"crossref","unstructured":"Kahn, J., et al.: LIBRI-LIGHT: a benchmark for ASR with limited or no supervision. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7669\u20137673. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"issue":"5","key":"17_CR15","doi-asserted-by":"publisher","first-page":"1136","DOI":"10.1109\/JPROC.2012.2237151","volume":"101","author":"H Li","year":"2013","unstructured":"Li, H., Ma, B., Lee, K.A.: Spoken language recognition: from fundamentals to practice. Proc. IEEE 101(5), 1136\u20131159 (2013)","journal-title":"Proc. IEEE"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Liu, H., et al.: End-to-end language diarization for bilingual code-switching speech. In: Proceedings of Interspeech, pp. 1489\u20131493 (2021)","DOI":"10.21437\/Interspeech.2021-82"},{"key":"17_CR17","doi-asserted-by":"publisher","first-page":"46","DOI":"10.1016\/j.csl.2016.03.001","volume":"40","author":"I Lopez-Moreno","year":"2016","unstructured":"Lopez-Moreno, I., Gonzalez-Dominguez, J., Martinez, D., Plchot, O., Gonzalez-Rodriguez, J., Moreno, P.J.: On the use of deep feedforward neural networks for automatic language identification. Comput. Speech Lang. 40, 46\u201359 (2016)","journal-title":"Comput. Speech Lang."},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Mendoza, S., Gillick, L., Ito, Y., Lowe, S., Newman, M.: Automatic language identification using large vocabulary continuous speech recognition. In: Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 785\u2013788 (1996)","DOI":"10.1109\/ICASSP.1996.543238"},{"issue":"4","key":"17_CR19","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1109\/79.317925","volume":"11","author":"YK Muthusamy","year":"1994","unstructured":"Muthusamy, Y.K., Barnard, E., Cole, R.A.: Reviewing automatic language identification. IEEE Signal Process. Mag. 11(4), 33\u201341 (1994)","journal-title":"IEEE Signal Process. Mag."},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Muthusamy, Y.K., Jain, N., Cole, R.A.: Perceptual benchmarks for automatic language identification. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. I-333 (1994)","DOI":"10.1109\/ICASSP.1994.389288"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Nakagawa, S., Ueda, Y., Seino, T.: Speaker-independent, text-independent language identification by HMM. In: Proceedings of Second International Conference on Spoken Language Processing (1992)","DOI":"10.21437\/ICSLP.1992-310"},{"issue":"1","key":"17_CR22","doi-asserted-by":"publisher","first-page":"512","DOI":"10.1121\/1.424522","volume":"105","author":"F Ramus","year":"1999","unstructured":"Ramus, F., Mehler, J.: Language identification with suprasegmental cues: a study based on speech resynthesis. J. Acoust. Soc. Am. 105(1), 512\u2013521 (1999)","journal-title":"J. Acoust. Soc. Am."},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Schultz, T., Rogina, I., Waibel, A.: LVCSR-based language identification. In: Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 781\u2013784 (1996)","DOI":"10.1109\/ICASSP.1996.543237"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Trong, T.N., Hautam\u00e4ki, V., Lee, K.A.: Deep language: a comprehensive deep learning approach to end-to-end language recognition. In: Proceedings of Odyssey: The Speaker and Language Recognition Workshop, vol. 2016, pp. 109\u2013116 (2016)","DOI":"10.21437\/Odyssey.2016-16"},{"key":"17_CR25","unstructured":"Van Dulm, O.: The grammar of English-Afrikaans code switching: a feature checking account. Ph.D. thesis, External Organizations (2007)"},{"key":"17_CR26","doi-asserted-by":"crossref","unstructured":"Van Leeuwen, D.A., Brummer, N.: Channel-dependent GMM and multi-class logistic regression models for language recognition. In: Proceedings of Odyssey: The Speaker and Language Recognition Workshop, pp. 1\u20138 (2006)","DOI":"10.1109\/ODYSSEY.2006.248094"},{"key":"17_CR27","unstructured":"Van Leeuwen, D.A., De Boer, M., Orr, R.: A human benchmark for the NIST language recognition evaluation 2005. In: Proceedings of Odyssey: The Speaker and Language Recognition Workshop, p. 12 (2008)"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Wang, C., et al.: VoxPopuli: a large-scale multilingual speech corpus for representation learning, semi-supervised learning and interpretation. arXiv preprint arXiv:2101.00390 (2021)","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Watanabe, S., Hori, T., Hershey, J.R.: Language independent end-to-end architecture for joint language identification and speech recognition. In: Proceedings of IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 265\u2013271 (2017)","DOI":"10.1109\/ASRU.2017.8268945"},{"key":"17_CR30","unstructured":"van der Westhuizen, E., Niesler, T.: A first South African corpus of multilingual code-switched soap opera speech. In: Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC) (2018)"},{"key":"17_CR31","unstructured":"Yan, Y.: Development of an approach to language identification based on language-dependent phone recognition. Oregon Graduate Institute of Science and Technology (1995)"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Yang, S.W., et al.: Superb: speech processing universal performance benchmark. In: Proceedings of Interspeech (2021)","DOI":"10.21437\/Interspeech.2021-1775"},{"issue":"3","key":"17_CR33","doi-asserted-by":"publisher","first-page":"624","DOI":"10.1016\/j.neuroimage.2008.07.025","volume":"43","author":"J Zhao","year":"2008","unstructured":"Zhao, J., Shu, H., Zhang, L., Wang, X., Gong, Q., Li, P.: Cortical competition during language discrimination. Neuroimage 43(3), 624\u2013633 (2008)","journal-title":"Neuroimage"},{"issue":"1","key":"17_CR34","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1109\/TSA.1996.481450","volume":"4","author":"MA Zissman","year":"1996","unstructured":"Zissman, M.A.: Comparison of four approaches to automatic language identification of telephone speech. IEEE Trans. Speech Audio Process. 4(1), 31 (1996)","journal-title":"IEEE Trans. Speech Audio Process."}],"container-title":["Communications in Computer and Information Science","Artificial Intelligence Research"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-22321-1_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,9]],"date-time":"2024-10-09T18:35:01Z","timestamp":1728498901000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-22321-1_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031223204","9783031223211"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-22321-1_17","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"28 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SACAIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Southern African Conference for Artificial Intelligence Research","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Stellenbosch","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"South Africa","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 December 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"sacair2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/sacair.org.za\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"73","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"26","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"36% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}