{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:22:24Z","timestamp":1760314944975,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032079589","type":"print"},{"value":"9783032079596","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07959-6_21","type":"book-chapter","created":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:27Z","timestamp":1760260947000},"page":"289-301","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Ensembling Synchronisation-Based and\u00a0Face-Voice Association Paradigms for\u00a0Robust Active Speaker Detection in\u00a0Egocentric Recordings"],"prefix":"10.1007","author":[{"given":"Jason","family":"Clarke","sequence":"first","affiliation":[]},{"given":"Yoshihiko","family":"Gotoh","sequence":"additional","affiliation":[]},{"given":"Stefan","family":"Goetze","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,13]]},"reference":[{"key":"21_CR1","doi-asserted-by":"crossref","unstructured":"Alcazar, J.L., Cordes, M., Zhao, C., Ghanem, B.: End-to-End active speaker detection. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-19836-6_8"},{"key":"21_CR2","doi-asserted-by":"crossref","unstructured":"Alcazar, J.L., et al.: Active speakers in context. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.01248"},{"key":"21_CR3","unstructured":"Authors of this paper (author names redacted, will be added in final version of this paper): Face-voice association for audiovisual active speaker detection in egocentric recordings. In: Submitted to European Signal Processing Conference (EUSIPCO) (2025)"},{"key":"21_CR4","doi-asserted-by":"publisher","unstructured":"Bredin, H., et al.: pyannote.audio: neural building blocks for speaker diarization. In: ICASSP 2020, IEEE International Conference on Acoustics, Speech, and Signal Processing, pp. 7124\u20137128 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054260","DOI":"10.1109\/ICASSP40776.2020.9054260"},{"key":"21_CR5","doi-asserted-by":"publisher","unstructured":"Cao, Q., Shen, L., Xie, W., Parkhi, O.M., Zisserman, A.: Vggface2: a dataset for recognising faces across pose and age. In: 2018 13th IEEE International Conference on Automatic Face & Gesture Recognition (FG 2018), pp. 67\u201374. IEEE Press (2018). https:\/\/doi.org\/10.1109\/FG.2018.00020","DOI":"10.1109\/FG.2018.00020"},{"key":"21_CR6","doi-asserted-by":"crossref","unstructured":"Cartucho, J., Ventura, R., Veloso, M.: Robust object recognition through symbiotic deep learning in mobile robots. In: 2018 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS) (2018)","DOI":"10.1109\/IROS.2018.8594067"},{"key":"21_CR7","doi-asserted-by":"publisher","unstructured":"Chen, G., Zhang, D., Liu, T., Du, X.: Self-lifting: a novel framework for unsupervised voice-face association learning. In: Proceedings of the 2022 International Conference on Multimedia Retrieval, ICMR \u201922, pp. 527\u2013535. Association for Computing Machinery, New York (2022). https:\/\/doi.org\/10.1145\/3512527.3531364","DOI":"10.1145\/3512527.3531364"},{"key":"21_CR8","doi-asserted-by":"publisher","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: Voxceleb2: deep speaker recognition. In: Interspeech 2018. ISCA (2018) https:\/\/doi.org\/10.21437\/Interspeech.2018-1929","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"21_CR9","doi-asserted-by":"publisher","unstructured":"Clarke, J., Gotoh, Y., Goetze, S.: Improving audiovisual active speaker detection in egocentric recordings with the data-efficient image transformer. In: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU23) (2023). https:\/\/doi.org\/10.1109\/ASRU57964.2023.10389764","DOI":"10.1109\/ASRU57964.2023.10389764"},{"key":"21_CR10","doi-asserted-by":"crossref","unstructured":"Clarke, J., Gotoh, Y., Goetze, S.: Speaker embedding informed audiovisual active speaker detection for egocentric recordings. In: Proceedings of International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2025). https:\/\/arxiv.org\/abs\/2502.06012","DOI":"10.1109\/ICASSP49660.2025.10890414"},{"key":"21_CR11","doi-asserted-by":"publisher","unstructured":"Datta, G., Etchart, T., Yadav, V., Hedau, V., Natarajan, P., Chang, S.F.: ASD-transformer: efficient active speaker detection using self and multimodal transformers. In: Proceedings of IEEE International\u00a0Conference\u00a0on Acoustics, Speech and Signal Processing (ICASSP) (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9746991","DOI":"10.1109\/ICASSP43922.2022.9746991"},{"key":"21_CR12","doi-asserted-by":"publisher","unstructured":"Desplanques, B., Thienpondt, J., Demuynck, K.: ECAPA-TDNN: emphasized channel attention, propagation and aggregation in TDNN based speaker verification. In: Interspeech 2020. ISCA (2020). https:\/\/doi.org\/10.21437\/interspeech.2020-2650","DOI":"10.21437\/interspeech.2020-2650"},{"key":"21_CR13","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C., Winn, J., Zisserman, A.: The PASCAL Visual Object Classes Challenge 2012 (VOC2012) Results (2012). http:\/\/www.pascal-network.org\/challenges\/VOC\/voc2012\/workshop\/index.html"},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Everingham, M., Sivic, J., Zisserman, A.: Hello! My name is... Buffy \u2013 automatic naming of characters in TV video. In: British Machine Vision Conference (2006)","DOI":"10.5244\/C.20.92"},{"key":"21_CR15","unstructured":"Grauman, K., et\u00a0al.: Ego4D: around the world in 3,000 hours of egocentric video. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)"},{"key":"21_CR16","doi-asserted-by":"publisher","unstructured":"Hernandez-Ortega, J., Galbally, J., Fierrez, J., Haraksim, R., Beslay, L.: Faceqnet: quality assessment for face recognition based on deep learning. In: 2019 International Conference on Biometrics (ICB), pp.\u00a01\u20138 (2019). https:\/\/doi.org\/10.1109\/ICB45273.2019.8987255","DOI":"10.1109\/ICB45273.2019.8987255"},{"key":"21_CR17","doi-asserted-by":"publisher","unstructured":"Huh, J., et al.: Advancing active speaker detection for egocentric videos. In: ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2025). https:\/\/doi.org\/10.1109\/ICASSP49660.2025.10888166","DOI":"10.1109\/ICASSP49660.2025.10888166"},{"key":"21_CR18","unstructured":"Ishibashi, T., Ono, K., Kugo, N., Sato, Y.: Technical Report for Ego4D Long Term Action Anticipation Challenge 2023 (2023). https:\/\/arxiv.org\/abs\/2307.01467"},{"key":"21_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Tao, R., Pan, Z., Li, H.: Target active speaker detection with audio-visual cues. In: Proceedings of Interspeech (2023)","DOI":"10.21437\/Interspeech.2023-574"},{"key":"21_CR20","doi-asserted-by":"publisher","unstructured":"K\u00f6p\u00fckl\u00fc, O., Taseska, M., Rigoll, G.: How to design a three-stage architecture for audio-visual active speaker detection in the wild. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00123","DOI":"10.1109\/ICCV48922.2021.00123"},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"Le\u2019on-Alc\u2019azar, J., Heilbron, F.C., Thabet, A.K., Ghanem, B.: MAAS: multi-modal assignation for active speaker detection. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00033"},{"key":"21_CR22","doi-asserted-by":"crossref","unstructured":"Liao, J., Duan, H., Feng, K., Zhao, W., Yang, Y., Chen, L.: A light weight model for active speaker detection. In: Proceedings of\u00a0IEEE\/CVF Conference\u00a0on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.02196"},{"key":"21_CR23","doi-asserted-by":"publisher","unstructured":"Meng, Q., Zhao, S., Huang, Z., Zhou, F.: Magface: a universal representation for face recognition and quality assessment. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14220\u201314229 (2021). https:\/\/doi.org\/10.1109\/CVPR46437.2021.01400","DOI":"10.1109\/CVPR46437.2021.01400"},{"key":"21_CR24","doi-asserted-by":"crossref","unstructured":"Min, K., Roy, S., Tripathi, S., Guha, T., Majumdar, S.: Learning long-term spatial-temporal graphs for active speaker detection. In: European\u00a0Conference\u00a0on Computer Vision (2022)","DOI":"10.1007\/978-3-031-19833-5_22"},{"key":"21_CR25","doi-asserted-by":"publisher","unstructured":"Ning, H., Zheng, X., Lu, X., Yuan, Y.: Disentangled representation learning for cross-modal biometric matching. IEEE Trans. Multimedia 24, 1763\u20131774 (2022). https:\/\/doi.org\/10.1109\/TMM.2021.3071243","DOI":"10.1109\/TMM.2021.3071243"},{"key":"21_CR26","first-page":"2825","volume":"12","author":"F Pedregosa","year":"2011","unstructured":"Pedregosa, F., et al.: Scikit-learn: machine learning in python. J. Mach. Learn. Res. 12, 2825\u20132830 (2011)","journal-title":"J. Mach. Learn. Res."},{"key":"21_CR27","doi-asserted-by":"publisher","unstructured":"Roth, J., et al.: Ava active speaker: an audio-visual dataset for active speaker detection. In: Proceedings of\u00a0International\u00a0Conference\u00a0on Acoustics, Speech and Signal Processing (ICASSP) (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053900","DOI":"10.1109\/ICASSP40776.2020.9053900"},{"key":"21_CR28","doi-asserted-by":"crossref","unstructured":"Saeed, M.S., et al.: Single-branch network for multimodal training. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10097207"},{"key":"21_CR29","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et al.: Going deeper with convolutions. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.\u00a01\u20139 (2014). https:\/\/api.semanticscholar.org\/CorpusID:206592484","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"21_CR30","doi-asserted-by":"crossref","unstructured":"Tao, R., et al.: Is someone speaking? Exploring long-term temporal features for audio-visual active speaker detection. In: Proceedings of\u00a029th ACM International\u00a0Conference\u00a0on Multimedia (2021)","DOI":"10.1145\/3474085.3475587"},{"key":"21_CR31","doi-asserted-by":"publisher","unstructured":"Terh\u00f6rst, P., Kolf, J.N., Damer, N., Kirchbuchner, F., Kuijper, A.: Ser-fiq: unsupervised estimation of face image quality based on stochastic embedding robustness. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5650\u20135659 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00569","DOI":"10.1109\/CVPR42600.2020.00569"},{"key":"21_CR32","unstructured":"Wang, J., Chen, G., Zheng, Y.D., Lu, T.: Exploring detection-based method for speaker diarization @ ego4d audio-only diarization challenge 2022 (2022). https:\/\/arxiv.org\/abs\/2211.08708"},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Wang, X., Cheng, F., Bertasius, G.: LoCoNet: long-short context network for active speaker detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01747"},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Wen, P., Xu, Q., Jiang, Y., Yang, Z., He, Y., Huang, Q.: Seeking the shape of sound: an adaptive framework for learning voice-face association. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 16347\u201316356 (2021)","DOI":"10.1109\/CVPR46437.2021.01608"},{"issue":"10","key":"21_CR35","doi-asserted-by":"publisher","first-page":"1499","DOI":"10.1109\/LSP.2016.2603342","volume":"23","author":"K Zhang","year":"2016","unstructured":"Zhang, K., Zhang, Z., Li, Z., Qiao, Y.: Joint face detection and alignment using multitask cascaded convolutional networks. IEEE Signal Process. Lett. 23(10), 1499\u20131503 (2016). https:\/\/doi.org\/10.1109\/LSP.2016.2603342","journal-title":"IEEE Signal Process. Lett."}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07959-6_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:35Z","timestamp":1760260955000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07959-6_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,13]]},"ISBN":["9783032079589","9783032079596"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07959-6_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,13]]},"assertion":[{"value":"13 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Szeged","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hungary","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom.inf.u-szeged.hu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}