{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:49:59Z","timestamp":1755802199819,"version":"3.44.0"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032025470"},{"type":"electronic","value":"9783032025487"}],"license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02548-7_8","type":"book-chapter","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:39:41Z","timestamp":1755754781000},"page":"84-95","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["An Exploration of\u00a0ECAPA-TDNN and\u00a0x-vector Speaker Representations in\u00a0Zero-Shot Multi-speaker TTS"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7187-8481","authenticated-orcid":false,"given":"Marie","family":"Kune\u0161ov\u00e1","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4001-9289","authenticated-orcid":false,"given":"Zden\u011bk","family":"Hanzl\u00ed\u010dek","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7408-7730","authenticated-orcid":false,"given":"Jind\u0159ich","family":"Matou\u0161ek","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"8_CR1","doi-asserted-by":"publisher","unstructured":"Cai, W., Chen, J., Li, M.: Exploring the encoding layer and loss function in end-to-end speaker and language recognition system. In: The Speaker and Language Recognition Workshop (Odyssey 2018), pp. 74\u201381 (2018). https:\/\/doi.org\/10.21437\/Odyssey.2018-11","DOI":"10.21437\/Odyssey.2018-11"},{"key":"8_CR2","unstructured":"Casanova, E., Weber, J., Shulby, C.D., Candido\u00a0Jr., A., G\u00f6lge, E., Ponti, M.A.: YourTTS: towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone. In: Proceedings of the 39th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0162, pp. 2709\u20132720. PMLR (2022). https:\/\/proceedings.mlr.press\/v162\/casanova22a.html"},{"issue":"6","key":"8_CR3","doi-asserted-by":"publisher","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","volume":"16","author":"S Chen","year":"2022","unstructured":"Chen, S., et al.: WavLM: large-scale self-supervised pre-training for full stack speech processing. IEEE J. Sel. Top. Signal Process. 16(6), 1505\u20131518 (2022). https:\/\/doi.org\/10.1109\/JSTSP.2022.3188113","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"8_CR4","doi-asserted-by":"publisher","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: VoxCeleb2: deep speaker recognition. In: Interspeech 2018, pp. 1086\u20131090 (2018). https:\/\/doi.org\/10.21437\/Interspeech.2018-1929","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"8_CR5","doi-asserted-by":"publisher","unstructured":"Cooper, E., et al.: Zero-shot multi-speaker text-to-speech with state-of-the-art neural speaker embeddings. In: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6184\u20136188 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054535","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"key":"8_CR6","doi-asserted-by":"publisher","unstructured":"Dawalatabad, N., Ravanelli, M., Grondin, F., Thienpondt, J., Desplanques, B., Na, H.: ECAPA-TDNN embeddings for speaker diarization. In: Interspeech 2021, pp. 3560\u20133564 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-941","DOI":"10.21437\/Interspeech.2021-941"},{"issue":"4","key":"8_CR7","doi-asserted-by":"publisher","first-page":"788","DOI":"10.1109\/TASL.2010.2064307","volume":"19","author":"N Dehak","year":"2011","unstructured":"Dehak, N., Kenny, P.J., Dehak, R., Dumouchel, P., Ouellet, P.: Front-end factor analysis for speaker verification. IEEE Trans. Audio Speech Lang. Process. 19(4), 788\u2013798 (2011). https:\/\/doi.org\/10.1109\/TASL.2010.2064307","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"8_CR8","doi-asserted-by":"publisher","unstructured":"Desplanques, B., Thienpondt, J., Demuynck, K.: ECAPA-TDNN: emphasized channel attention, propagation and aggregation in TDNN based speaker verification. In: Interspeech 2020, pp. 3830\u20133834 (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-2650","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"8_CR9","doi-asserted-by":"publisher","unstructured":"Doddipatla, R., Braunschweiler, N., Maia, R.: Speaker adaptation in DNN-based speech synthesis using D-vectors. In: Interspeech 2017, pp. 3404\u20133408 (2017). https:\/\/doi.org\/10.21437\/Interspeech.2017-1038","DOI":"10.21437\/Interspeech.2017-1038"},{"key":"8_CR10","doi-asserted-by":"publisher","unstructured":"Gusev, A., Avdeeva, A.: Improvement speaker similarity for zero-shot any-to-any voice conversion of whispered and regular speech. In: Interspeech 2024, pp. 2735\u20132739 (2024). https:\/\/doi.org\/10.21437\/Interspeech.2024-2091","DOI":"10.21437\/Interspeech.2024-2091"},{"key":"8_CR11","doi-asserted-by":"publisher","unstructured":"Heo, H.S., Lee, B.J., Huh, J., Chung, J.S.: Clova baseline system for the VoxCeleb Speaker Recognition Challenge 2020. arXiv preprint arXiv:2009.14153 (2020). https:\/\/doi.org\/10.48550\/arXiv.2009.14153","DOI":"10.48550\/arXiv.2009.14153"},{"key":"8_CR12","unstructured":"ITU-R Recommendation BS.1534-3: Method for the subjective assessment of intermediate quality level of audio systems. Technical report, International Telecommunication Union (2015)"},{"key":"8_CR13","doi-asserted-by":"publisher","unstructured":"Jeong, M., Kim, M., Kim, S., Kim, N.S.: Evidential-TTS: high fidelity zero-shot text-to-speech using evidential deep learning. In: ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2025). https:\/\/doi.org\/10.1109\/ICASSP49660.2025.10889279","DOI":"10.1109\/ICASSP49660.2025.10889279"},{"key":"8_CR14","unstructured":"Kim, J., Kong, J., Son, J.: Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech. In: Proceedings of the 38th International Conference on Machine Learning, vol.\u00a0139, pp. 5530\u20135540. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/kim21f.html"},{"key":"8_CR15","doi-asserted-by":"publisher","unstructured":"Koluguri, N.R., Park, T., Ginsburg, B.: TitaNet: neural model for speaker representation with 1D depth-wise separable convolutions and global context. In: ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 8102\u20138106 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9746806","DOI":"10.1109\/ICASSP43922.2022.9746806"},{"key":"8_CR16","doi-asserted-by":"publisher","unstructured":"Kuchaiev, O., et\u00a0al.: NeMo: a toolkit for building AI applications using neural modules. arXiv preprint arXiv:1909.09577 (2019). https:\/\/doi.org\/10.48550\/arXiv.1909.09577","DOI":"10.48550\/arXiv.1909.09577"},{"key":"8_CR17","doi-asserted-by":"publisher","unstructured":"Li, C., et al.: Deep speaker: an end-to-end neural speaker embedding system (2017). https:\/\/doi.org\/10.48550\/arXiv.1705.02304","DOI":"10.48550\/arXiv.1705.02304"},{"key":"8_CR18","doi-asserted-by":"publisher","unstructured":"Li, H., Zhu, X., Xue, L., Song, Y., Chen, Y., Xie, L.: SponTTS: modeling and transferring spontaneous style for TTS. In: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 12171\u201312175 (2024). https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10445828","DOI":"10.1109\/ICASSP48485.2024.10445828"},{"key":"8_CR19","doi-asserted-by":"publisher","unstructured":"Loweimi, E., Qian, M., Knill, K., Gales, M.: On the usefulness of speaker embeddings for speaker retrieval in the wild: a comparative study of x-vector and ECAPA-TDNN models. In: Interspeech 2024. pp. 3774\u20133778 (2024). https:\/\/doi.org\/10.21437\/Interspeech.2024-161","DOI":"10.21437\/Interspeech.2024-161"},{"key":"8_CR20","doi-asserted-by":"publisher","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: VoxCeleb: a large-scale speaker identification dataset. In: Interspeech 2017, pp. 2616\u20132620 (2017). https:\/\/doi.org\/10.21437\/Interspeech.2017-950","DOI":"10.21437\/Interspeech.2017-950"},{"key":"8_CR21","doi-asserted-by":"publisher","unstructured":"Ravanelli, M., et\u00a0al.: SpeechBrain: a general-purpose speech toolkit. arXiv preprint arXiv:2106.04624 (2021). https:\/\/doi.org\/10.48550\/arXiv.2106.04624","DOI":"10.48550\/arXiv.2106.04624"},{"key":"8_CR22","doi-asserted-by":"publisher","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., Povey, D., Khudanpur, S.: X-vectors: robust DNN embeddings for speaker recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5329\u20135333 (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8461375","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"8_CR23","doi-asserted-by":"publisher","unstructured":"Variani, E., Lei, X., McDermott, E., Lopez\u00a0Moreno, I., Gonzalez-Dominguez, J.: Deep neural networks for small footprint text-dependent speaker verification. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4052\u20134056 (2014). https:\/\/doi.org\/10.1109\/ICASSP.2014.6854363","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"8_CR24","doi-asserted-by":"publisher","unstructured":"Wan, L., Wang, Q., Papir, A., Lopez\u00a0Moreno, I.: Generalized end-to-end loss for speaker verification. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). pp. 4879\u20134883 (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8462665","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"8_CR25","doi-asserted-by":"publisher","unstructured":"Xin, D., Saito, Y., Takamichi, S., Koriyama, T., Saruwatari, H.: Cross-lingual speaker adaptation using domain adaptation and speaker consistency loss for text-to-speech synthesis. In: Interspeech 2021, pp. 1614\u20131618 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-897","DOI":"10.21437\/Interspeech.2021-897"},{"key":"8_CR26","doi-asserted-by":"publisher","unstructured":"Xue, J., Deng, Y., Han, Y., Li, Y., Sun, J., Liang, J.: ECAPA-TDNN for multi-speaker text-to-speech synthesis. In: 2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP), pp. 230\u2013234 (2022). https:\/\/doi.org\/10.1109\/ISCSLP57327.2022.10037956","DOI":"10.1109\/ISCSLP57327.2022.10037956"},{"key":"8_CR27","doi-asserted-by":"publisher","unstructured":"Yang, S., et\u00a0al.: SUPERB: speech processing universal PERformance benchmark. In: Interspeech 2021, pp. 1194\u20131198 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-1775","DOI":"10.21437\/Interspeech.2021-1775"}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02548-7_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:39:44Z","timestamp":1755754784000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02548-7_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,22]]},"ISBN":["9783032025470","9783032025487"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02548-7_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,8,22]]},"assertion":[{"value":"22 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Erlangen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.kiv.zcu.cz\/tsd2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}