{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T21:16:41Z","timestamp":1757625401233,"version":"3.44.0"},"publisher-location":"Cham","reference-count":17,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032025470"},{"type":"electronic","value":"9783032025487"}],"license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02548-7_1","type":"book-chapter","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:38:51Z","timestamp":1755754731000},"page":"3-12","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Lightweight Target-Speaker-Based Overlap Transcription for\u00a0Practical Streaming ASR"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9453-0034","authenticated-orcid":false,"given":"Ale\u0161","family":"Pra\u017e\u00e1k","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7187-8481","authenticated-orcid":false,"given":"Marie","family":"Kune\u0161ov\u00e1","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0764-3207","authenticated-orcid":false,"given":"Josef","family":"Psutka","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"1_CR1","unstructured":"Baevski, A., Zhou, H., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"1_CR2","doi-asserted-by":"crossref","unstructured":"Charlet, D., Barras, C., Li\u00e9nard, J.S.: Impact of overlapping speech detection on speaker diarization for broadcast news and debates. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7707\u20137711 (2013)","DOI":"10.1109\/ICASSP.2013.6639163"},{"key":"1_CR3","doi-asserted-by":"publisher","unstructured":"Elminshawi, M., Mack, W., Chakrabarty, S., Habets, E.: New insights on target speaker extraction (2022). https:\/\/doi.org\/10.48550\/arXiv.2202.00733","DOI":"10.48550\/arXiv.2202.00733"},{"key":"1_CR4","doi-asserted-by":"crossref","unstructured":"Gulati, A., et al.: Conformer: convolution-augmented transformer for speech recognition. In: Interspeech, pp. 5036\u20135040 (2020)","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"1_CR5","doi-asserted-by":"crossref","unstructured":"Huang, Z., Raj, D., Garc\u00eda, P., Khudanpur, S.: Adapting self-supervised models to multi-talker speech recognition using speaker embeddings. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10097139"},{"key":"1_CR6","doi-asserted-by":"crossref","unstructured":"Kalda, J., Alum\u00e4e, T.: Collar-aware training for streaming speaker change detection in broadcast speech. In: The Speaker and Language Recognition Workshop (Odyssey), pp. 141\u2013147 (2022)","DOI":"10.21437\/Odyssey.2022-20"},{"key":"1_CR7","doi-asserted-by":"crossref","unstructured":"Kanda, N., et al.: Streaming multi-talker ASR with token-level serialized output training. In: Interspeech, pp. 3774\u20133778 (2022)","DOI":"10.21437\/Interspeech.2022-7"},{"key":"1_CR8","doi-asserted-by":"crossref","unstructured":"Koluguri, N.R., Park, T., Ginsburg, B.: TitaNet: neural model for speaker representation with 1D depth-wise separable convolutions and global context. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 8102\u20138106 (2022)","DOI":"10.1109\/ICASSP43922.2022.9746806"},{"key":"1_CR9","doi-asserted-by":"crossref","unstructured":"Kune\u0161ov\u00e1, M., Zaj\u00edc, Z., \u0160m\u00eddl, L., Karafi\u00e1t, M.: Comparison of wav2vec 2.0 models on three speech processing tasks. Int. J. Speech Technol. 27, 847\u2013859 (2024)","DOI":"10.1007\/s10772-024-10140-6"},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Lehe\u010dka, J., \u0160vec, J., Prazak, A., Psutka, J.: Exploring capabilities of monolingual audio transformers using large datasets in automatic speech recognition of Czech. In: Interspeech, pp. 1831\u20131835 (2022)","DOI":"10.21437\/Interspeech.2022-10439"},{"key":"1_CR11","unstructured":"Max Planck Institute for Psycholinguistics: ELAN (version 6.9) [computer software] (2024). https:\/\/archive.mpi.nl\/tla\/elan"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Mu, Z., Yang, X., Zhu, W.: Multi-dimensional and multi-scale modeling for speech separation optimized by discriminative learning. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10094612"},{"key":"1_CR13","unstructured":"Perez, E., Strub, F., Vries, H., Dumoulin, V., Courville, A.: FiLM: visual reasoning with a general conditioning layer. In: AAAI\u201918\/IAAI\u201918\/EAAI\u201918, pp. 3942\u20133951 (2018)"},{"key":"1_CR14","unstructured":"Yao, Z., et al.: Zipformer: a faster and better encoder for automatic speech recognition. In: The Twelfth International Conference on Learning Representations (ICLR) (2024)"},{"key":"1_CR15","doi-asserted-by":"crossref","unstructured":"Yu, D., Kolb\u00e6k, M., Tan, Z.H., Jensen, J.: Permutation invariant training of deep models for speaker-independent multi-talker speech separation. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 241\u2013245 (2017)","DOI":"10.1109\/ICASSP.2017.7952154"},{"key":"1_CR16","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Puvvada, K.C., Lavrukhin, V., Ginsburg, B.: Conformer-based target-speaker automatic speech recognition for single-channel audio. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10095115"},{"issue":"3","key":"1_CR17","doi-asserted-by":"publisher","first-page":"8","DOI":"10.1109\/MSP.2023.3240008","volume":"40","author":"K Zmolikova","year":"2023","unstructured":"Zmolikova, K., Delcroix, M., Ochiai, T., Kinoshita, K., \u010cernock\u00fd, J., Yu, D.: Neural target speech extraction: an overview. IEEE Signal Process. Mag. 40(3), 8\u201329 (2023)","journal-title":"IEEE Signal Process. Mag."}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02548-7_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T18:04:53Z","timestamp":1757441093000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02548-7_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,22]]},"ISBN":["9783032025470","9783032025487"],"references-count":17,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02548-7_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,8,22]]},"assertion":[{"value":"22 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Erlangen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.kiv.zcu.cz\/tsd2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}