{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T07:04:09Z","timestamp":1743145449256,"version":"3.40.3"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031483110"},{"type":"electronic","value":"9783031483127"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-48312-7_39","type":"book-chapter","created":{"date-parts":[[2023,11,21]],"date-time":"2023-11-21T20:03:21Z","timestamp":1700597001000},"page":"490-502","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Ensemble of\u00a0Incremental System Enhancements for\u00a0Robust Speaker Diarization in\u00a0Code-Switched Real-Life Audios"],"prefix":"10.1007","author":[{"given":"Raj","family":"Gohil","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ramya","family":"Viswanathan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Saurabh","family":"Agrawal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"C. M.","family":"Vikram","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Madhu R.","family":"Kamble","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kamini","family":"Sabu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"M. Ali Basha","family":"Shaik","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Krishna K. S","family":"Rajesh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,11,22]]},"reference":[{"key":"39_CR1","unstructured":"Displace challenge. https:\/\/codalab.lisn.upsaclay.fr\/competitions\/10588"},{"key":"39_CR2","unstructured":"Displace challenge evaluation plan. https:\/\/displace2023.github.io\/docs\/DISPLACE_Evaluation_Plan_v1.pdf"},{"key":"39_CR3","unstructured":"openslr.org. https:\/\/www.openslr.org\/28\/"},{"issue":"2","key":"39_CR4","doi-asserted-by":"publisher","first-page":"356","DOI":"10.1109\/TASL.2011.2125954","volume":"20","author":"X Anguera","year":"2012","unstructured":"Anguera, X., Bozonnet, S., Evans, N., Fredouille, C., Friedland, G., Vinyals, O.: Speaker diarization: a review of recent research. IEEE Trans. Audio Speech Lang. Process. 20(2), 356\u2013370 (2012)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"39_CR5","doi-asserted-by":"publisher","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: VoxCeleb2: deep speaker recognition. In: Proceedings of the Interspeech 2018, pp. 1086\u20131090 (2018). https:\/\/doi.org\/10.21437\/Interspeech.2018-1929","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"39_CR6","doi-asserted-by":"publisher","unstructured":"Dawalatabad, N., Ravanelli, M., Grondin, F., Thienpondt, J., Desplanques, B., Na, H.: ECAPA-TDNN embeddings for speaker diarization. In: Proceedings of the Interspeech 2021, pp. 3560\u20133564 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-941","DOI":"10.21437\/Interspeech.2021-941"},{"key":"39_CR7","doi-asserted-by":"publisher","unstructured":"Desplanques, B., Thienpondt, J., Demuynck, K.: ECAPA-TDNN: emphasized channel attention, propagation and aggregation in TDNN based speaker verification. In: Proceedings of the Interspeech 2020, pp. 3830\u20133834 (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-2650","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"39_CR8","doi-asserted-by":"crossref","unstructured":"Raj, D., et al.: DOVER-Lap: a method for combining overlap-aware diarization outputs. In: 2021 IEEE Spoken Language Technology Workshop (SLT) (2021)","DOI":"10.1109\/SLT48900.2021.9383490"},{"issue":"2","key":"39_CR9","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2019.2938758","volume":"43","author":"SH Gao","year":"2019","unstructured":"Gao, S.H., Cheng, M.M., Zhao, K., Zhang, X.Y., Yang, M.H., Torr, P.: Res2net: a new multi-scale backbone architecture. IEEE Trans. Pattern Anal. Mach. Intell. 43(2), 652\u2013662 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"39_CR10","doi-asserted-by":"crossref","unstructured":"Graves, A., Graves, A.: Long short-term memory. In: Supervised Sequence Labelling With Recurrent Neural Networks, pp. 37\u201345 (2012)","DOI":"10.1007\/978-3-642-24797-2_4"},{"key":"39_CR11","doi-asserted-by":"crossref","unstructured":"Gudepu, P., Koroth, M.J., Sabu, K., Shaik, M.A.B.: Dynamic encoder RNN for online voice activity detection in adverse noise conditions. In: Interspeech (Accepted for Publication). Dublin, Ireland (2023)","DOI":"10.21437\/Interspeech.2023-2466"},{"key":"39_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"39_CR13","unstructured":"Hermans, A., Beyer, L., Leibe, B.: In defense of the triplet loss for person re-identification. arXiv preprint arXiv:1703.07737 (2017)"},{"key":"39_CR14","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"39_CR15","doi-asserted-by":"crossref","unstructured":"Ko, T., Peddinti, V., Povey, D., Seltzer, M.L., Khudanpur, S.: A study on data augmentation of reverberant speech for robust speech recognition. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5220\u20135224. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"39_CR16","doi-asserted-by":"crossref","unstructured":"Li, T.W., Lee, G.C.: Performance analysis of fine-tune transferred deep learning. In: IEEE 3rd Eurasia Conference on IOT, Communication and Engineering (ECICE), pp. 315\u2013319 (2021)","DOI":"10.1109\/ECICE52819.2021.9645649"},{"key":"39_CR17","doi-asserted-by":"publisher","unstructured":"Lin, Q., Yin, R., Li, M., Bredin, H., Barras, C.: LSTM based similarity measurement with spectral clustering for speaker diarization. In: Proceedings Interspeech 2019, pp. 366\u2013370 (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-1388","DOI":"10.21437\/Interspeech.2019-1388"},{"key":"39_CR18","doi-asserted-by":"publisher","first-page":"395","DOI":"10.1007\/s11222-007-9033-z","volume":"17","author":"U von Luxburg","year":"2007","unstructured":"von Luxburg, U.: A tutorial on spectral clustering. Stat. Comput. 17, 395\u2013416 (2007)","journal-title":"Stat. Comput."},{"key":"39_CR19","doi-asserted-by":"crossref","unstructured":"Lyu, D.C., Chng, E.S., Li, H.: Language diarization for code-switch conversational speech. In: 2013 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 7314\u20137318. IEEE (2013)","DOI":"10.1109\/ICASSP.2013.6639083"},{"key":"39_CR20","doi-asserted-by":"publisher","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: VoxCeleb: a large-scale speaker identification dataset. In: Proceedings of the Interspeech 2017, pp. 2616\u20132620 (2017). https:\/\/doi.org\/10.21437\/Interspeech.2017-950","DOI":"10.21437\/Interspeech.2017-950"},{"key":"39_CR21","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2021.101317","volume":"72","author":"TJ Park","year":"2022","unstructured":"Park, T.J., Kanda, N., Dimitriadis, D., Han, K.J., Watanabe, S., Narayanan, S.: A review of speaker diarization: recent advances with deep learning. Comput. Speech Lang. 72, 101317 (2022)","journal-title":"Comput. Speech Lang."},{"key":"39_CR22","doi-asserted-by":"crossref","unstructured":"Prabhavalkar, R., Hori, T., Sainath, T.N., Schl\u00fcter, R., Watanabe, S.: End-to-end speech recognition: a survey. arXiv (2023). arXiv:2303.03329","DOI":"10.1109\/TASLP.2023.3328283"},{"key":"39_CR23","doi-asserted-by":"publisher","unstructured":"Ryant, N., et al.: The third DIHARD diarization challenge. In: Proceedings of the Interspeech 2021, pp. 3570\u20133574 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-1208","DOI":"10.21437\/Interspeech.2021-1208"},{"issue":"1","key":"39_CR24","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1109\/MSP.2016.2617341","volume":"34","author":"R Sarikaya","year":"2017","unstructured":"Sarikaya, R.: The technology behind personal digital assistants: an overview of the system architecture and key components. IEEE Signal Process. Mag. 34(1), 67\u201381 (2017). https:\/\/doi.org\/10.1109\/MSP.2016.2617341","journal-title":"IEEE Signal Process. Mag."},{"key":"39_CR25","doi-asserted-by":"publisher","unstructured":"Sertsi, P., Boonkla, S., Chunwijitra, V., Kurpukdee, N., Wutiwiwatchai, C.: Robust voice activity detection based on LSTM recurrent neural networks and modulation spectrum. In: Proceedings of the APSIPA ASC, pp. 342\u2013346 (2017). https:\/\/doi.org\/10.21437\/Interspeech.2021-941","DOI":"10.21437\/Interspeech.2021-941"},{"key":"39_CR26","doi-asserted-by":"crossref","unstructured":"Shankarappa, R., Tiwari, S.: A faster approach for direct speech to speech translation. In: IEEE WINTECHCON. Bangalore (2022)","DOI":"10.1109\/WINTECHCON55229.2022.9832314"},{"key":"39_CR27","unstructured":"Snyder, D., Chen, G., Povey, D.: MUSAN: a music, speech, and noise corpus. arXiv preprint arXiv:1510.08484 (2015)"},{"key":"39_CR28","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., Povey, D., Khudanpur, S.: X-vectors: robust DNN embeddings for speaker recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5329\u20135333. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"39_CR29","unstructured":"Team, S.: Silero VAD: pre-trained enterprise-grade voice activity detector (VAD), number detector and language classifier. https:\/\/github.com\/snakers4\/silero-vad (2021)"},{"key":"39_CR30","doi-asserted-by":"crossref","unstructured":"Variani, E., Lei, X., McDermott, E., Moreno, I.L., Gonzalez-Dominguez, J.: Deep neural networks for small footprint text-dependent speaker verification. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4052\u20134056 (2014)","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"39_CR31","unstructured":"Zeinali, H., Wang, S., Silnova, A., Mat\u011bjka, P., Plchot, O.: BUT system description to VoxCceleb speaker recognition challenge 2019. arXiv preprint arXiv:1910.12592 (2019)"},{"key":"39_CR32","doi-asserted-by":"publisher","first-page":"3427","DOI":"10.1109\/TASLP.2021.3125142","volume":"29","author":"Y Zhou","year":"2021","unstructured":"Zhou, Y., Tian, X., Li, H.: Language agnostic speaker embedding for cross-lingual personalized speech generation. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3427\u20133439 (2021). https:\/\/doi.org\/10.1109\/TASLP.2021.3125142","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-48312-7_39","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,21]],"date-time":"2023-11-21T20:08:19Z","timestamp":1700597299000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-48312-7_39"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031483110","9783031483127"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-48312-7_39","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"22 November 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Dharwad","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 November 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 December 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.iitdh.ac.in\/specom-2023\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"174","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"94","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"54% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}