{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T15:51:13Z","timestamp":1763740273414,"version":"3.45.0"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T00:00:00Z","timestamp":1763683200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T00:00:00Z","timestamp":1763683200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"crossref","award":["2021"],"award-info":[{"award-number":["2021"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J AUDIO SPEECH MUSIC PROC."],"DOI":"10.1186\/s13636-025-00431-4","type":"journal-article","created":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T15:22:23Z","timestamp":1763738543000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Speaker embedding loss for end-to-end speaker diarization without external embedding networks"],"prefix":"10.1186","volume":"2025","author":[{"given":"Jaehee","family":"Jung","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wooil","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,21]]},"reference":[{"issue":"4","key":"431_CR1","doi-asserted-by":"publisher","first-page":"788","DOI":"10.1109\/TASL.2010.2064307","volume":"19","author":"N 
Dehak","year":"2010","unstructured":"N. Dehak, P.J. Kenny, R. Dehak, P. Dumouchel, P. Ouellet, Front-end factor analysis for speaker verification. IEEE Trans. Audio Speech Lang. Process. 19(4), 788\u2013798 (2010)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"431_CR2","doi-asserted-by":"crossref","unstructured":"E. Variani, X. Lei, E. McDermott, I. L. Moreno, J. Gonzalez-Dominguez: Deep neural networks for small footprint text-dependent speaker verification. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2014). Florence, Italy; May 2014: 4052\u20134056.","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"431_CR3","doi-asserted-by":"crossref","unstructured":"D. Snyder, D. Garcia-Romero, D. Povey, S. Khudanpur: Deep neural network embeddings for text-independent speaker verification. Interspeech 2017. Stockholm, Sweden; August 2017: 999\u20131003.","DOI":"10.21437\/Interspeech.2017-620"},{"key":"431_CR4","doi-asserted-by":"crossref","unstructured":"D. Garcia-Romero, D. Snyder, G. Sell, D. Povey, A. McCree: Speaker diarization using deep neural network embeddings. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2017). New Orleans, USA; March 2017: 4930\u20134934.","DOI":"10.1109\/ICASSP.2017.7953094"},{"key":"431_CR5","doi-asserted-by":"crossref","unstructured":"G. Sell, D. Garcia-Romero: Diarization resegmentation in the factor analysis subspace. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2015). Brisbane, Australia; April 2015: 4794\u20134798.","DOI":"10.1109\/ICASSP.2015.7178881"},{"key":"431_CR6","doi-asserted-by":"crossref","unstructured":"Q. Wang, C. Downey, L. Wan, P. A. Mansfield, I. L. Moreno: Speaker diarization with LSTM. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2018). 
Calgary, Canada; April 2018: 5239\u20135243.","DOI":"10.1109\/ICASSP.2018.8462628"},{"key":"431_CR7","doi-asserted-by":"crossref","unstructured":"M. Diez, L. Burget, S. Wang, J. Rohdin, J. Cernock\u00fd: Bayesian HMM Based x-Vector Clustering for Speaker Diarization. Interspeech 2019. Graz, Austria; Sept. 2019: 346\u2013350.","DOI":"10.21437\/Interspeech.2019-2813"},{"key":"431_CR8","doi-asserted-by":"crossref","unstructured":"D. Raj, Z. Huang, S. Khudanpur: Multi-Class Spectral Clustering with Overlaps for Speaker Diarization. IEEE Spoken Language Technology Workshop (SLT 2021). Shenzhen, China; Jan. 2021: 582\u2013589.","DOI":"10.1109\/SLT48900.2021.9383602"},{"key":"431_CR9","doi-asserted-by":"crossref","unstructured":"K. Kinoshita, M. Delcroix, N. Tawara: Integrating end-to-end neural and clustering-based diarization: Getting the best of both worlds. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021). Toronto, Canada; June 2021: 7198\u20137202.","DOI":"10.1109\/ICASSP39728.2021.9414333"},{"key":"431_CR10","doi-asserted-by":"crossref","unstructured":"Y. Fujita, N. Kanda, S. Horiguchi, Y. Xue, K. Nagamatsu, S. Watanabe: End-to-end neural speaker diarization with self-attention. IEEE Automatic Speech Recognition and Understanding Workshop (ASRU 2019). Sentosa, Singapore; Dec. 2019: 296\u2013303.","DOI":"10.1109\/ASRU46091.2019.9003959"},{"key":"431_CR11","doi-asserted-by":"crossref","unstructured":"I. Medennikov, M. Korenevsky, T. Prisyach, Y. Khokhlov, M. Korenevskaya, I. Sorokin, T. Timofeeva, A. Mitrofanov, A. Andrusenko, I. Podluzhny, A. Laptev, A. Romanenko: Target-speaker voice activity detection: a novel approach for multi-speaker diarization in a dinner party scenario. Interspeech 2020. Shanghai, China; Oct. 2020: 3570\u20133574.","DOI":"10.21437\/Interspeech.2020-1602"},{"key":"431_CR12","doi-asserted-by":"crossref","unstructured":"Y. C. Liu, E. Han, C. Lee, A. 
Stolcke: End-to-end neural diarization: From transformer to conformer. Interspeech 2021. Brno, Czechia; Sept. 2021: 3081\u20133085.","DOI":"10.21437\/Interspeech.2021-1909"},{"key":"431_CR13","doi-asserted-by":"crossref","unstructured":"S. Horiguchi, Y. Fujita, S. Watanabe, Y. Xue, K. Nagamatsu: End-to-end speaker diarization for an unknown number of speakers with encoder-decoder based attractors. Interspeech 2020. Shanghai, China; Oct. 2020: 269\u2013273.","DOI":"10.21437\/Interspeech.2020-1022"},{"key":"431_CR14","unstructured":"Y. Fujita, S. Watanabe, S. Horiguchi, Y. Xue, J. Shi, K. Nagamatsu: Neural speaker diarization with speaker-wise chain rule. arXiv preprint arXiv:2006.01796. 2020. https:\/\/arxiv.org\/abs\/2006.01796"},{"key":"431_CR15","doi-asserted-by":"crossref","unstructured":"S. Maiti, H. Erdogan, K. Wilson, S. Wisdom, S. Watanabe, J. R. Hershey: End-to-end diarization for variable number of speakers with local-global networks and discriminative speaker embeddings. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021). Toronto, Canada; June 2021: 7183\u20137187.","DOI":"10.1109\/ICASSP39728.2021.9414841"},{"key":"431_CR16","doi-asserted-by":"crossref","unstructured":"Y. Ueda, S. Maiti, S. Watanabe, C. Zhang, M. Yu, S. X. Zhang, Y. Xu: EEND-SS: Joint End-to-End Neural Speaker Diarization and Speech Separation for Flexible Number of Speakers. IEEE Spoken Language Technology Workshop (SLT 2022). Doha, Qatar; Jan. 2023: 480\u2013487.","DOI":"10.1109\/SLT54892.2023.10022924"},{"key":"431_CR17","doi-asserted-by":"crossref","unstructured":"Z. Du, S. Zhang, S. Zheng, Z. Yan: Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis. 2022 Conference on Empirical Methods in Natural Language Processing (EMNLP 2022). Abu Dhabi, UAE; Dec. 2022: 7458\u20137469.","DOI":"10.18653\/v1\/2022.emnlp-main.505"},{"key":"431_CR18","doi-asserted-by":"crossref","unstructured":"K. Kinoshita, M. Delcroix, T. 
Iwata: Tight integration of neural-and clustering-based diarization through deep unfolding of infinite gaussian mixture model. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2022). Singapore; May 2022: 8382\u20138386.","DOI":"10.1109\/ICASSP43922.2022.9746234"},{"key":"431_CR19","doi-asserted-by":"publisher","DOI":"10.3390\/s25051509","volume":"25","author":"J Kotus","year":"2025","unstructured":"J. Kotus, G. Szwoch, Separation of simultaneous speakers with acoustic vector sensor. Sensors 25, 1509 (2025). https:\/\/doi.org\/10.3390\/s25051509","journal-title":"Sensors"},{"key":"431_CR20","doi-asserted-by":"crossref","unstructured":"Y. Fujita, N. Kanda, S. Horiguchi, K. Nagamatsu, S. Watanabe: End-to-end neural speaker diarization with permutation-free objectives. Interspeech 2019. Graz, Austria; Sept. 2019: 4300\u20134304.","DOI":"10.21437\/Interspeech.2019-2899"},{"key":"431_CR21","doi-asserted-by":"crossref","unstructured":"K. Kinoshita, M. Delcroix, N. Tawara: Advances in integration of end-to-end neural and clustering-based diarization for real conversational speech. Interspeech 2021. Brno, Czechia; Sept. 2021: 3565\u20133569.","DOI":"10.21437\/Interspeech.2021-1004"},{"key":"431_CR22","doi-asserted-by":"crossref","unstructured":"S. Horiguchi, S. Watanabe, P. Garc\u00eda, Y. Xue, Y. Takashima, Y. Kawaguchi: Towards neural diarization for unlimited numbers of speakers using global and local attractors. IEEE Automatic Speech Recognition and Understanding Workshop (ASRU 2021). Cartagena, Colombia; Dec. 2021: 98\u2013105.","DOI":"10.1109\/ASRU51503.2021.9687875"},{"key":"431_CR23","doi-asserted-by":"crossref","unstructured":"V. Panayotov, G. Chen, D. Povey, S. Khudanpur, Librispeech: an ASR corpus based on public domain audio books. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2015). 
Brisbane, Australia; April 2015: 5206\u20135210.","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"431_CR24","unstructured":"D. Snyder, G. Chen, D. Povey, Musan: A music, speech, and noise corpus. arXiv preprint arXiv:1510.08484. 2015. https:\/\/arxiv.org\/abs\/1510.08484"},{"key":"431_CR25","doi-asserted-by":"crossref","unstructured":"T. Ko, V. Peddinti, D. Povey, M. L. Seltzer, S. Khudanpur: A study on data augmentation of reverberant speech for robust speech recognition. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2017). New Orleans, USA; March 2017: 5220\u20135224.","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"431_CR26","doi-asserted-by":"crossref","unstructured":"J. Carletta, S. Ashby, S. Bourban, M. Flynn, M. Guillemot, T. Hain, P. Wellner: The AMI meeting corpus: A pre-announcement. International Workshop on Machine Learning for Multimodal Interaction (MLMI 2005). Edinburgh, UK; July 2005: 28\u201339.","DOI":"10.1007\/11677482_3"},{"key":"431_CR27","doi-asserted-by":"crossref","unstructured":"A. Janin, D. Baron, J. Edwards, D. Ellis, D. Gelbart, N. Morgan, C. Wooters: The ICSI meeting corpus. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2003). Hong Kong, China; April 2003: 364\u2013367.","DOI":"10.1109\/ICASSP.2003.1198793"},{"key":"431_CR28","doi-asserted-by":"crossref","unstructured":"H. Bredin, R. Yin, J. M. Coria, G. Gelly, P. Korshunov, M. Lavechin, M. P. Gill: Pyannote.audio: neural building blocks for speaker diarization. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2020). Barcelona, Spain; May 2020: 7124\u20137128.","DOI":"10.1109\/ICASSP40776.2020.9052974"},{"key":"431_CR29","unstructured":"J. G. Fiscus, J. Ajot, J. S. Garofolo: The rich transcription 2007 meeting recognition evaluation. International Evaluation Workshop on Classification of Events, Activities and Relationships (CLEAR 2006). 
Southampton, UK; April 2006: 373\u2013389."},{"key":"431_CR30","unstructured":"M. Przybocki and M. Alvin, 2000 NIST Speaker Recognition Evaluation. 2001. [Online] (accessed Oct. 12, 2025) https:\/\/catalog.ldc.upenn.edu\/LDC2001S97"},{"key":"431_CR31","unstructured":"Kaldi recipe [Online]. (accessed Oct. 12, 2025). https:\/\/github.com\/kaldi-asr\/kaldi\/blob\/master\/egs\/callhome_diarization\/v2\/run.sh"},{"key":"431_CR32","doi-asserted-by":"crossref","unstructured":"C. Wang, J. Li, X. Fang, J. Kang and Y. Li: End-to-End Neural Speaker Diarization with Absolute Speaker Loss. Interspeech 2023. Dublin, Ireland; Aug. 2023: 3577\u20133581.","DOI":"10.21437\/Interspeech.2023-656"}],"container-title":["EURASIP Journal on Audio, Speech, and Music Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-025-00431-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s13636-025-00431-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-025-00431-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T15:22:27Z","timestamp":1763738547000},"score":1,"resource":{"primary":{"URL":"https:\/\/asmp-eurasipjournals.springeropen.com\/articles\/10.1186\/s13636-025-00431-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,21]]},"references-count":32,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["431"],"URL":"https:\/\/doi.org\/10.1186\/s13636-025-00431-4","relation":{},"ISSN":["1687-4722"],"issn-type":[{"value":"1687-4722","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,21]]},"assertion":[{"
value":"29 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 October 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"40"}}