{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:19:53Z","timestamp":1777655993417,"version":"3.51.4"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030208721","type":"print"},{"value":"9783030208738","type":"electronic"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-20873-8_18","type":"book-chapter","created":{"date-parts":[[2019,5,25]],"date-time":"2019-05-25T20:32:03Z","timestamp":1558816323000},"page":"276-292","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":44,"title":["On Learning Associations of Faces and Voices"],"prefix":"10.1007","author":[{"given":"Changil","family":"Kim","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hijung Valentina","family":"Shin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tae-Hyun","family":"Oh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexandre","family":"Kaspar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mohamed","family":"Elgharib","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wojciech","family":"Matusik","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,5,26]]},"reference":[{"key":"18_CR1","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Look, listen and learn. In: ICCV, pp. 609\u2013617 (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"18_CR2","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: SoundNet: learning sound representations from unlabeled video. In: NIPS, pp. 892\u2013900 (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"18_CR3","doi-asserted-by":"crossref","unstructured":"Bau, D., Zhou, B., Khosla, A., Oliva, A., Torralba, A.: Network dissection: quantifying interpretability of deep visual representations. In: CVPR, pp. 3319\u20133327 (2017)","DOI":"10.1109\/CVPR.2017.354"},{"issue":"1\u20132","key":"18_CR4","doi-asserted-by":"publisher","first-page":"75","DOI":"10.1002\/icd.249","volume":"10","author":"H Brookes","year":"2001","unstructured":"Brookes, H., Slater, A., Quinn, P.C., Lewkowicz, D.J., Hayes, R., Brown, E.: Three-month-old infants learn arbitrary auditory-visual pairings between voices and faces. Infant Child Dev. 10(1\u20132), 75\u201382 (2001)","journal-title":"Infant Child Dev."},{"issue":"12","key":"18_CR5","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1016\/j.tics.2007.10.001","volume":"11","author":"S Campanella","year":"2007","unstructured":"Campanella, S., Belin, P.: Integrating face and voice in person perception. Trends Cogn. Sci. 11(12), 535\u2013543 (2007)","journal-title":"Trends Cogn. Sci."},{"key":"18_CR6","doi-asserted-by":"crossref","unstructured":"Chen, W., Chen, X., Zhang, J., Huang, K.: Beyond triplet loss: a deep quadruplet network for person re-identification. In: CVPR, pp. 1320\u20131329 (2017)","DOI":"10.1109\/CVPR.2017.145"},{"key":"18_CR7","unstructured":"Chopra, S., Hadsell, R., LeCun, Y.: Learning a similarity metric discriminatively, with application to face verification. In: CVPR, pp. 539\u2013546 (2005)"},{"key":"18_CR8","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Senior, A.W., Vinyals, O., Zisserman, A.: Lip reading sentences in the wild. In: CVPR, pp. 3444\u20133453 (2017)","DOI":"10.1109\/CVPR.2017.367"},{"key":"18_CR9","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: ICCV, pp. 1422\u20131430 (2015)","DOI":"10.1109\/ICCV.2015.167"},{"issue":"1","key":"18_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1207\/s15326969eco0501_1","volume":"5","author":"WW Gaver","year":"1993","unstructured":"Gaver, W.W.: What in the world do we hear? an ecological approach to auditory event perception. Ecol. Psychol. 5(1), 1\u201329 (1993)","journal-title":"Ecol. Psychol."},{"key":"18_CR11","doi-asserted-by":"crossref","unstructured":"Gebru, I.D., Ba, S., Evangelidis, G., Horaud, R.: Tracking the active speaker based on a joint audio-visual observation model. In: ICCV Workshop, pp. 15\u201321 (2015)","DOI":"10.1109\/ICCVW.2015.96"},{"key":"18_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1007\/978-3-319-24261-3_7","volume-title":"Similarity-Based Pattern Recognition","author":"E Hoffer","year":"2015","unstructured":"Hoffer, E., Ailon, N.: Deep metric learning using triplet network. In: Feragen, A., Pelillo, M., Loog, M. (eds.) SIMBAD 2015. LNCS, vol. 9370, pp. 84\u201392. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24261-3_7"},{"key":"18_CR13","unstructured":"Hoover, K., Chaudhuri, S., Pantofaru, C., Slaney, M., Sturdy, I.: Putting a face to the voice: fusing audio and visual signals across a video to determine speakers. arXiv preprint arXiv:1706.00079 (2017)"},{"issue":"3","key":"18_CR14","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1016\/j.cortex.2010.03.003","volume":"47","author":"F Joassin","year":"2011","unstructured":"Joassin, F., Pesenti, M., Maurage, P., Verreckt, E., Bruyer, R., Campanella, S.: Cross-modal interactions between human faces and voices involved in person recognition. Cortex 47(3), 367\u2013376 (2011)","journal-title":"Cortex"},{"issue":"3","key":"18_CR15","doi-asserted-by":"publisher","first-page":"241","DOI":"10.3758\/BF03203206","volume":"17","author":"B Jones","year":"1975","unstructured":"Jones, B., Kabanoff, B.: Eye movements in auditory space perception. Atten. Percept. Psychophys. 17(3), 241\u2013245 (1975)","journal-title":"Atten. Percept. Psychophys."},{"issue":"19","key":"18_CR16","doi-asserted-by":"publisher","first-page":"1709","DOI":"10.1016\/j.cub.2003.09.005","volume":"13","author":"M Kamachi","year":"2003","unstructured":"Kamachi, M., Hill, H., Lander, K., Vatikiotis-Bateson, E.: \u201cPutting the face to the voice\u201d: matching identity across modality. Curr. Biol. 13(19), 1709\u20131714 (2003)","journal-title":"Curr. Biol."},{"issue":"4","key":"18_CR17","doi-asserted-by":"publisher","first-page":"94:1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. 36(4), 94:1\u201394:12 (2017)","journal-title":"ACM Trans. Graph."},{"key":"18_CR18","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"issue":"3","key":"18_CR19","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1162\/0898929053279577","volume":"17","author":"K Kriegstein von","year":"2005","unstructured":"von Kriegstein, K., Kleinschmidt, A., Sterzer, P., Giraud, A.L.: Interaction of face and voice areas during speaker recognition. J. Cogn. Neurosci. 17(3), 367\u2013376 (2005)","journal-title":"J. Cogn. Neurosci."},{"issue":"3","key":"18_CR20","doi-asserted-by":"publisher","first-page":"159","DOI":"10.1207\/s15326969eco1603_1","volume":"16","author":"L Lachs","year":"2004","unstructured":"Lachs, L., Pisoni, D.B.: Crossmodal source identification in speech perception. Ecol. Psychol. 16(3), 159\u2013187 (2004)","journal-title":"Ecol. Psychol."},{"key":"18_CR21","doi-asserted-by":"crossref","unstructured":"Liu, Z., Luo, P., Wang, X., Tang, X.: Deep learning face attributes in the wild. In: ICCV, pp. 3730\u20133738 (2015)","DOI":"10.1109\/ICCV.2015.425"},{"key":"18_CR22","first-page":"2579","volume":"9","author":"L Maaten van der","year":"2008","unstructured":"van der Maaten, L., Hinton, G.: Visualizing data using t-SNE. JMLR 9, 2579\u20132605 (2008)","journal-title":"JMLR"},{"issue":"2","key":"18_CR23","doi-asserted-by":"publisher","first-page":"307","DOI":"10.1037\/a0030945","volume":"39","author":"LW Mavica","year":"2013","unstructured":"Mavica, L.W., Barenholtz, E.: Matching voice and face identity from static images. J. Exp. Psychol. Hum. Percept. Perform. 39(2), 307\u2013312 (2013)","journal-title":"J. Exp. Psychol. Hum. Percept. Perform."},{"issue":"5588","key":"18_CR24","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1038\/264746a0","volume":"264","author":"H McGurk","year":"1976","unstructured":"McGurk, H., MacDonald, J.: Hearing lips and seeing voices. Nature 264(5588), 746\u2013748 (1976)","journal-title":"Nature"},{"key":"18_CR25","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., Zisserman, A.: Seeing voices and hearing faces: cross-modal biometric matching. In: CVPR, pp. 8427\u20138436 (2018)","DOI":"10.1109\/CVPR.2018.00879"},{"key":"18_CR26","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: VoxCeleb: a large-scale speaker identification dataset. In: INTERSPEECH, pp. 2616\u20132620 (2017)","DOI":"10.21437\/Interspeech.2017-950"},{"key":"18_CR27","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., Ng, A.Y.: Multimodal deep learning. In: ICML, pp. 689\u2013696 (2011)"},{"key":"18_CR28","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J.H., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: CVPR, pp. 2405\u20132413 (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"18_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1007\/978-3-319-46448-0_48","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Owens","year":"2016","unstructured":"Owens, A., Wu, J., McDermott, J.H., Freeman, W.T., Torralba, A.: Ambient sound provides supervision for visual learning. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 801\u2013816. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_48"},{"key":"18_CR30","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A.: Deep face recognition. In: BMVC, pp. 41.1\u201341.12 (2015)","DOI":"10.5244\/C.29.41"},{"key":"18_CR31","doi-asserted-by":"crossref","unstructured":"Senocak, A., Oh, T., Kim, J., Yang, M., Kweon, I.S.: Learning to localize sound source in visual scenes. arXiv preprint arXiv:1803.03849 (2018)","DOI":"10.1109\/CVPR.2018.00458"},{"issue":"6","key":"18_CR32","doi-asserted-by":"publisher","first-page":"589","DOI":"10.3758\/BF03198830","volume":"28","author":"BR Shelton","year":"1980","unstructured":"Shelton, B.R., Searle, C.L.: The influence of vision on the absolute identification of sound-source position. Percept. Psychophys. 28(6), 589\u2013596 (1980)","journal-title":"Percept. Psychophys."},{"key":"18_CR33","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"issue":"4","key":"18_CR34","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1073\/pnas.1008169108","volume":"108","author":"J Sliwa","year":"2011","unstructured":"Sliwa, J., Duhamel, J.R., Pascalis, O., Wirth, S.: Spontaneous voice-face identity matching by rhesus monkeys for familiar conspecifics and humans. PNAS 108(4), 1735\u20131740 (2011)","journal-title":"PNAS"},{"issue":"1","key":"18_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1177\/1474704916630317","volume":"14","author":"HM Smith","year":"2016","unstructured":"Smith, H.M., Dunn, A.K., Baguley, T., Stacey, P.C.: Concordant cues in faces and voices: testing the backup signal hypothesis. Evol. Psychol. 14(1), 1\u201310 (2016)","journal-title":"Evol. Psychol."},{"issue":"3","key":"18_CR36","doi-asserted-by":"publisher","first-page":"868","DOI":"10.3758\/s13414-015-1045-8","volume":"78","author":"HM Smith","year":"2016","unstructured":"Smith, H.M., Dunn, A.K., Baguley, T., Stacey, P.C.: Matching novel face and voice identity using static and dynamic facial images. Atten. Percept. Psychophys. 78(3), 868\u2013879 (2016)","journal-title":"Atten. Percept. Psychophys."},{"key":"18_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"900","DOI":"10.1007\/978-3-319-48881-3_59","volume-title":"Computer Vision \u2013 ECCV 2016 Workshops","author":"M Sol\u00e8r","year":"2016","unstructured":"Sol\u00e8r, M., Bazin, J.-C., Wang, O., Krause, A., Sorkine-Hornung, A.: Suggesting sounds for images from video collections. In: Hua, G., J\u00e9gou, H. (eds.) ECCV 2016. LNCS, vol. 9914, pp. 900\u2013917. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-48881-3_59"},{"issue":"4","key":"18_CR38","doi-asserted-by":"publisher","first-page":"95:1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing Obama: learning lip sync from audio. ACM Trans. Graph. 36(4), 95:1\u201395:13 (2017)","journal-title":"ACM Trans. Graph."},{"issue":"4","key":"18_CR39","doi-asserted-by":"publisher","first-page":"93:1","DOI":"10.1145\/3072959.3073699","volume":"36","author":"SL Taylor","year":"2017","unstructured":"Taylor, S.L., et al.: A deep learning approach for generalized speech animation. ACM Trans. Graph. 36(4), 93:1\u201393:11 (2017)","journal-title":"ACM Trans. Graph."},{"key":"18_CR40","doi-asserted-by":"crossref","unstructured":"Torralba, A., Efros, A.A.: Unbiased look at dataset bias. In: CVPR, pp. 1521\u20131528 (2011)","DOI":"10.1109\/CVPR.2011.5995347"},{"key":"18_CR41","doi-asserted-by":"crossref","unstructured":"Tzeng, E., Hoffman, J., Saenko, K., Darrell, T.: Adversarial discriminative domain adaptation. In: CVPR, pp. 2962\u20132971 (2017)","DOI":"10.1109\/CVPR.2017.316"},{"key":"18_CR42","unstructured":"Vendrov, I., Kiros, R., Fidler, S., Urtasun, R.: Order-embeddings of images and language. arXiv preprint arXiv:1511.06361 (2015)"},{"key":"18_CR43","doi-asserted-by":"crossref","unstructured":"Wu, Z., Singh, B., Davis, L.S., Subrahmanian, V.S.: Deception detection in videos. In: AAAI (2018)","DOI":"10.1609\/aaai.v32i1.11502"},{"issue":"2","key":"18_CR44","doi-asserted-by":"publisher","first-page":"429","DOI":"10.3758\/s13423-014-0685-3","volume":"22","author":"LJ Zweig","year":"2015","unstructured":"Zweig, L.J., Suzuki, S., Grabowecky, M.: Learned face-voice pairings facilitate visual search. Psychon. Bull. Rev. 22(2), 429\u2013436 (2015)","journal-title":"Psychon. Bull. Rev."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2018"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-20873-8_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,18]],"date-time":"2022-09-18T17:17:13Z","timestamp":1663521433000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-20873-8_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030208721","9783030208738"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-20873-8_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"26 May 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Perth, WA","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Australia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 December 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 December 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/accv2018.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"Microsoft CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"979","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"274","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"2.7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}}]}}