{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T02:35:18Z","timestamp":1778812518704,"version":"3.51.4"},"publisher-location":"Cham","reference-count":17,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030863616","type":"print"},{"value":"9783030863623","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-86362-3_36","type":"book-chapter","created":{"date-parts":[[2021,9,11]],"date-time":"2021-09-11T11:02:35Z","timestamp":1631358155000},"page":"439-450","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["FaVoA: Face-Voice Association Favours Ambiguous Speaker Detection"],"prefix":"10.1007","author":[{"given":"Hugo","family":"Carneiro","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cornelius","family":"Weber","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stefan","family":"Wermter","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,9,7]]},"reference":[{"key":"36_CR1","doi-asserted-by":"crossref","unstructured":"Alc\u00e1zar, J.L., et al.: Active speakers in context. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.01248"},{"key":"36_CR2","unstructured":"Arevalo, J., Solorio, T., Montes-y-G\u00f3mez, M., Gonz\u00e1lez, F.A.: Gated multimodal units for information fusion. In: 5th International Conference on Learning Representations, ICLR 2017, Workshop Track Proceedings (2017). OpenReview.net"},{"issue":"3","key":"36_CR3","doi-asserted-by":"publisher","first-page":"541","DOI":"10.1037\/0012-1649.41.3.541","volume":"41","author":"LE Bahrick","year":"2005","unstructured":"Bahrick, L.E., Hernandez-Reif, M., Flom, R.: The development of infant learning about specific face-voice relations. Dev. Psychol. 41(3), 541\u2013552 (2005)","journal-title":"Dev. Psychol."},{"key":"36_CR4","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), Doha, Qatar, pp. 1724\u20131734. Association for Computational Linguistics (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"36_CR5","unstructured":"Choi, H.S., Park, C., Lee, K.: From inference to generation: end-to-end fully self-supervised generation of human face from speech. In: 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, 26\u201330 April 2020 (2020). OpenReview.net"},{"key":"36_CR6","unstructured":"Chung, J.S.: Naver at Activitynet Challenge 2019 - Task B Active Speaker Detection (AVA) (2019). https:\/\/research.google.com\/ava\/2019\/Naver_Corporation.pdf"},{"key":"36_CR7","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Huh, J., Nagrani, A., Afouras, T., Zisserman, A.: Spot the conversation: speaker diarisation in the wild. In: Interspeech 2020, 21st Annual Conference of the International Speech Communication Association, pp. 299\u2013303. ISCA (2020)","DOI":"10.21437\/Interspeech.2020-2337"},{"key":"36_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1207\/s15326969eco0501_1","volume":"5","author":"WW Gaver","year":"1993","unstructured":"Gaver, W.W.: What in the world do we hear? An ecological approach to auditory event perception. Ecol. Psychol. 5, 1\u201329 (1993)","journal-title":"Ecol. Psychol."},{"issue":"2","key":"36_CR9","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1109\/TETCI.2017.2784878","volume":"2","author":"J Hou","year":"2018","unstructured":"Hou, J., Wang, S., Lai, Y., Tsao, Y., Chang, H., Wang, H.: Audio-visual speech enhancement using multimodal deep convolutional neural networks. IEEE Trans. Emerging Top. Comput. Intell. 2(2), 117\u2013128 (2018)","journal-title":"IEEE Trans. Emerging Top. Comput. Intell."},{"key":"36_CR10","doi-asserted-by":"crossref","unstructured":"Huang, C., Koishida, K.: Improved active speaker detection based on optical flow. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops (2020)","DOI":"10.1109\/CVPRW50498.2020.00483"},{"key":"36_CR11","doi-asserted-by":"crossref","unstructured":"Kim, C., Shin, H.V., Oh, T.H., Kaspar, A., Elgharib, M., Matusik, W.: On learning associations of faces and voices. In: Proceedings of Asian Conference on Computer Vision (ACCV) (2018)","DOI":"10.1007\/978-3-030-20873-8_18"},{"key":"36_CR12","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., Zisserman, A.: Learnable PINs: cross-modal embeddings for person identity. In: European Conference on Computer Vision (2018)","DOI":"10.1007\/978-3-030-01261-8_5"},{"key":"36_CR13","doi-asserted-by":"crossref","unstructured":"Oh, T., et al.: Speech2Face: learning the face behind a voice. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7531\u20137540 (2019)","DOI":"10.1109\/CVPR.2019.00772"},{"key":"36_CR14","doi-asserted-by":"crossref","unstructured":"Qu, L., Weber, C., Wermter, S.: LipSound: Neural mel-spectrogram reconstruction for lip reading. In: Interspeech 2019, 20th Annual Conference of the International Speech Communication Association, pp. 2768\u20132772. ISCA (2019)","DOI":"10.21437\/Interspeech.2019-1393"},{"key":"36_CR15","doi-asserted-by":"crossref","unstructured":"Qu, L., Weber, C., Wermter, S.: Multimodal target speech separation with voice and face references. In: Interspeech 2020, 21st Annual Conference of the International Speech Communication Association, pp. 1416\u20131420. ISCA (2020)","DOI":"10.21437\/Interspeech.2020-1697"},{"key":"36_CR16","doi-asserted-by":"crossref","unstructured":"Roth, J., et al.: AVA-ActiveSpeaker: an audio-visual dataset for active speaker detection. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4492\u20134496 (2020)","DOI":"10.1109\/ICASSP40776.2020.9053900"},{"key":"36_CR17","unstructured":"Zhang, Y.H., Xiao, J., Yang, S., Shan, S.: Multi-task learning for audio-visual active speaker detection (2019). https:\/\/research.google.com\/ava\/2019\/Multi_Task_Learning_for_Audio_Visual_Active_Speaker_Detection.pdf"}],"container-title":["Lecture Notes in Computer Science","Artificial Neural Networks and Machine Learning \u2013 ICANN 2021"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-86362-3_36","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,8]],"date-time":"2024-09-08T06:10:28Z","timestamp":1725775828000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-86362-3_36"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030863616","9783030863623"],"references-count":17,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-86362-3_36","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"7 September 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bratislava","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Slovakia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 September 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icann2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/e-nns.org\/icann2021\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OCS","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"496","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"265","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"53% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Conference was held online due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}