{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T19:05:03Z","timestamp":1768071903521,"version":"3.49.0"},"reference-count":52,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Naver Corporation"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. Signal Process."],"published-print":{"date-parts":[[2020,3]]},"DOI":"10.1109\/jstsp.2020.2987720","type":"journal-article","created":{"date-parts":[[2020,4,14]],"date-time":"2020-04-14T21:23:12Z","timestamp":1586899392000},"page":"568-576","source":"Crossref","is-referenced-by-count":15,"title":["Perfect Match: Self-Supervised Embeddings for Cross-Modal Retrieval"],"prefix":"10.1109","volume":"14","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6529-2196","authenticated-orcid":false,"given":"Soo-Whan","family":"Chung","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7741-7275","authenticated-orcid":false,"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6554-0783","authenticated-orcid":false,"given":"Hong-Goo","family":"Kang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","article-title":"An overview of automatic speaker recognition technology","author":"reynolds","year":"0","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2018.02.001"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.202"},{"key":"ref31","article-title":"Look, listen and learn","author":"arandjelovi?","year":"0","journal-title":"Proc Int Conf Comput Vision"},{"key":"ref30","article-title":"Detecting audio-visual synchrony using deep neural networks","author":"marcheret","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54184-6_6"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.367"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.5244\/C.28.6"},{"key":"ref34","first-page":"1857","article-title":"Improved deep metric learning with multi-class n-pair loss objective","author":"sohn","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1155\/2007\/70186"},{"key":"ref27","article-title":"On learning associations of faces and voices","author":"kim","year":"2018"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2007.906583"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298958"},{"key":"ref1","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"ref24","article-title":"Co-training of audio and video representations from self-supervised temporal synchronization","author":"korbar","year":"2018"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682524"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_5"},{"key":"ref25","article-title":"Audio-visual scene analysis with self-supervised multisensory features","author":"owens","year":"0","journal-title":"Proc IEEE Conf Comput Vision Pattern Recognit"},{"key":"ref50","article-title":"Deep face recognition","author":"parkhi","year":"0","journal-title":"Proc Brit Mach Vision Conf"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"ref52","article-title":"Understanding the difficulty of training deep feedforward neural networks","author":"glorot","year":"0","journal-title":"Proc Int Conf Artif Intell Statist"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.715"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.226"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2006.886017"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00060"},{"key":"ref13","doi-asserted-by":"crossref","first-page":"504","DOI":"10.1126\/science.1127647","article-title":"Reducing the dimensionality of data with neural networks","volume":"313","author":"hinton","year":"2006","journal-title":"Science"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.278"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995350"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2012.6467268"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247923"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref5","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1142\/S012906571450004X"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-19309-5_55"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.89"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-950"},{"key":"ref47","article-title":"Unsupervised speaker adaptation based on the cosine similarity for text-independent speaker verification","author":"shum","year":"0","journal-title":"Proc Odyssey"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00879"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459197"},{"key":"ref44","article-title":"Disjoint mapping network for cross-modal matching of voices and faces","author":"wen","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"}],"container-title":["IEEE Journal of Selected Topics in Signal Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/4200690\/9126272\/09067055.pdf?arnumber=9067055","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T17:08:26Z","timestamp":1651079306000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9067055\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,3]]},"references-count":52,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2020.2987720","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,3]]}}}