{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:32:53Z","timestamp":1776886373677,"version":"3.51.2"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032025470","type":"print"},{"value":"9783032025487","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02548-7_12","type":"book-chapter","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:38:59Z","timestamp":1755754739000},"page":"133-145","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Combining Temporal Visual Dynamics and\u00a0Audio Representations for\u00a0Robust Speaker Identification"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-3657-1001","authenticated-orcid":false,"given":"Christopher","family":"Simic","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3582-2154","authenticated-orcid":false,"given":"Korbinian","family":"Riedhammer","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7780-8821","authenticated-orcid":false,"given":"Tobias","family":"Bocklet","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"12_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"284","DOI":"10.1007\/978-3-642-24958-7_33","volume-title":"Neural Information Processing","author":"N Asbai","year":"2011","unstructured":"Asbai, N., Amrouche, A., Debyeche, M.: Performances evaluation of GMM-UBM and GMM-SVM for speaker recognition in realistic world. In: Lu, B.-L., Zhang, L., Kwok, J. (eds.) ICONIP 2011. LNCS, vol. 7063, pp. 284\u2013291. Springer, Heidelberg (2011). https:\/\/doi.org\/10.1007\/978-3-642-24958-7_33"},{"key":"12_CR2","doi-asserted-by":"publisher","unstructured":"Chen, Z., Wang, S., Qian, Y.: Multi-modality matters: a performance leap on VoxCeleb. In: Proceedings of Interspeech 2020, pp. 2252\u20132256 (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-2229","DOI":"10.21437\/Interspeech.2020-2229"},{"key":"12_CR3","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: VoxCeleb2: deep speaker recognition. In: INTERSPEECH 2018 (2018)","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"12_CR4","doi-asserted-by":"publisher","unstructured":"Cumani, S., Brummer, N., Burget, L., Laface, P.: Fast discriminative speaker verification in the i-vector space. In: ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings, pp. 4852\u20134855 (2011). https:\/\/doi.org\/10.1109\/ICASSP.2011.5947442","DOI":"10.1109\/ICASSP.2011.5947442"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Dan, J., et al.: TransFace: calibrating transformer training for face recognition from a data-centric perspective. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20642\u201320653 (2023)","DOI":"10.1109\/ICCV51070.2023.01887"},{"issue":"4","key":"12_CR6","doi-asserted-by":"publisher","first-page":"788","DOI":"10.1109\/TASL.2010.2064307","volume":"19","author":"N Dehak","year":"2011","unstructured":"Dehak, N., Kenny, P.J., Dehak, R., Dumouchel, P., Ouellet, P.: Front-end factor analysis for speaker verification. IEEE Trans. Audio Speech Lang. Process. 19(4), 788\u2013798 (2011). https:\/\/doi.org\/10.1109\/TASL.2010.2064307","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"12_CR7","doi-asserted-by":"publisher","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: ArcFace: additive angular margin loss for deep face recognition. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Long Beach, CA, USA, pp. 4685\u20134694 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00482","DOI":"10.1109\/CVPR.2019.00482"},{"key":"12_CR8","doi-asserted-by":"publisher","unstructured":"Desplanques, B., Thienpondt, J., Demuynck, K.: ECAPA-TDNN: emphasized channel attention, propagation and aggregation in TDNN based speaker verification (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-2650","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"12_CR9","doi-asserted-by":"publisher","unstructured":"Garcia-Romero, D., Espy-Wilson, C.Y.: Analysis of i-vector length normalization in speaker recognition systems. In: Proceedings of Interspeech 2011, pp. 249\u2013252 (2011). https:\/\/doi.org\/10.21437\/Interspeech.2011-53","DOI":"10.21437\/Interspeech.2011-53"},{"key":"12_CR10","doi-asserted-by":"publisher","unstructured":"Ko, T., Peddintim V., Povey, D., Seltzer, M.l., Khudanpur, S.: A study on data augmentation of reverberant speech for robust speech recognition. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), New Orleans, LA, USA, pp. 5220\u20135224 (2017). https:\/\/doi.org\/10.1109\/ICASSP.2017.7953152","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"12_CR11","doi-asserted-by":"publisher","unstructured":"Liu, B., Qian, Y.: ECAPA++: Fine-grained deep embedding learning for TDNN based speaker verification. In: Proceedings of Interspeech 2023, pp. 3132\u20133136 (2023). https:\/\/doi.org\/10.21437\/Interspeech.2023-777","DOI":"10.21437\/Interspeech.2023-777"},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: VoxCeleb: a large-scale speaker identification dataset. In: INTERSPEECH 2017 (2017)","DOI":"10.21437\/Interspeech.2017-950"},{"key":"12_CR13","doi-asserted-by":"publisher","unstructured":"Patil, A.A., Agarkar, B.S.: VGG FaceNet based sketch to face recognition with morphable model. In: IEEE Bombay Section Signature Conference (IBSSC). Mumbai, India 2022, pp. 1\u20135 (2022). https:\/\/doi.org\/10.1109\/IBSSC56953.2022.10037264","DOI":"10.1109\/IBSSC56953.2022.10037264"},{"key":"12_CR14","unstructured":"Ravanelli, M., et al.: SpeechBrain: a general-purpose speech toolkit. arXiv:2106.04624 [eess.AS] (2021)"},{"key":"12_CR15","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1006\/dspr.1999.0361","volume":"10","author":"D Reynolds","year":"2000","unstructured":"Reynolds, D., Quatieri, T., Dunn, R.: Speaker verification using adapted Gaussian mixture models. Digital Signal Process. 10, 19\u201341 (2000). https:\/\/doi.org\/10.1006\/dspr.1999.0361","journal-title":"Digital Signal Process."},{"key":"12_CR16","doi-asserted-by":"publisher","unstructured":"Zheng, R., Zhang, S., Bo, X.: Text-independent speaker identification using GMM-UBM and frame level likelihood normalization. In: International Symposium on Chinese Spoken Language Processing. Hong Kong, China 2004, pp. 289\u2013292 (2004). https:\/\/doi.org\/10.1109\/CHINSL.2004.1409643","DOI":"10.1109\/CHINSL.2004.1409643"},{"key":"12_CR17","doi-asserted-by":"publisher","unstructured":"Sar\u0131, L., Singh, K., Zhou, J., Torresani, L., Singhal, N., Saraf, Y.: A multi-view approach to audio-visual speaker verification. In: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Toronto, ON, Canada, pp. 6194\u20136198 (2021). https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9414260","DOI":"10.1109\/ICASSP39728.2021.9414260"},{"key":"12_CR18","doi-asserted-by":"publisher","unstructured":"Schroff, F., Kalenichenko, D., Philbin, J.: Facenet: a unified embedding for face recognition and clustering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Boston, 7-12 June 2015, pp. 815\u2013823 (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7298682","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"12_CR19","doi-asserted-by":"publisher","unstructured":"Selvakumar, A., Fashandi, H.: Getting more for less: using weak labels and AV-Mixup for robust audio-visual speaker verification. In: Proceedings of Interspeech 2024, pp. 4728\u20134732 (2024). https:\/\/doi.org\/10.21437\/Interspeech.2024-53","DOI":"10.21437\/Interspeech.2024-53"},{"key":"12_CR20","doi-asserted-by":"publisher","unstructured":"Simic, C., Bocklet, T.: Self-supervised adaptive AV fusion module for pre-trained ASR models. In: ICASSP 2024 \u2013 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 12787\u201312791 (2024). https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10448047","DOI":"10.1109\/ICASSP48485.2024.10448047"},{"key":"12_CR21","unstructured":"Snyder, D., Chen, G., Povey, D.: MUSAN: a music, speech, and noise corpus. https:\/\/www.openslr.org\/17\/. Accessed 27 May 2025"},{"key":"12_CR22","doi-asserted-by":"publisher","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., Povey, D., Khudanpur, S.: X-vectors: robust DNN embeddings for speaker recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Calgary, AB, Canada, pp. 5329\u2013533 (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8461375","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"12_CR23","doi-asserted-by":"publisher","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., McCree, A., Povey, D., Khudanpur, S.: Speaker recognition for multi-speaker conversations using X-vectors. In: ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Brighton, UK, pp. 5796\u20135800 (2019). https:\/\/doi.org\/10.1109\/ICASSP.2019.8683760","DOI":"10.1109\/ICASSP.2019.8683760"},{"key":"12_CR24","doi-asserted-by":"publisher","unstructured":"Taigman, Y., Yang, M., Ranzato, M., Wolf, L.: DeepFace: closing the gap to human-level performance in face verification. In: 2014 IEEE Conference on Computer Vision and Pattern Recognition, Columbus, OH, USA, pp. 1701\u20131708 (2014). https:\/\/doi.org\/10.1109\/CVPR.2014.220","DOI":"10.1109\/CVPR.2014.220"},{"key":"12_CR25","doi-asserted-by":"publisher","unstructured":"Tao, R., Lee, K.A., Shi, Z., Li, H.: speaker recognition with two-step multi-modal deep cleansing. In: ICASSP 2023 \u2013 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 1\u20135 (2023). https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10096814","DOI":"10.1109\/ICASSP49357.2023.10096814"},{"key":"12_CR26","doi-asserted-by":"publisher","unstructured":"Yao, J., Liang, C., Peng, Z., Zhang, B., Zhang, X.-L.: Branch-ECAPA-TDNN: a parallel branch architecture to capture local and global features for speaker verification. In: Proceedings of Interspeech 2023, pp. 1943\u20131947 (2023). https:\/\/doi.org\/10.21437\/Interspeech.2023-402","DOI":"10.21437\/Interspeech.2023-402"},{"key":"12_CR27","unstructured":"Sun, Z., Tzimiropoulos, G.: Part-based face recognition with vision transformers. In: Proceedings of the 33rd British Machine Vision Conference (BMVC 2022), London, UK, November 21\u201324, 2022. BMVA Press (2022)"},{"key":"12_CR28","doi-asserted-by":"publisher","unstructured":"Gnana Praveen, R., Alam, J.: Cross-modal transformers for audio-visual person verification. In: Proceedings of the Speaker and Language Recognition Workshop (Odyssey 2024), pp. 240\u2013246 (2024). https:\/\/doi.org\/10.21437\/odyssey.2024-34","DOI":"10.21437\/odyssey.2024-34"}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02548-7_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:39:05Z","timestamp":1755754745000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02548-7_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,22]]},"ISBN":["9783032025470","9783032025487"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02548-7_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8,22]]},"assertion":[{"value":"22 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Erlangen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.kiv.zcu.cz\/tsd2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}