{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:14:39Z","timestamp":1778080479157,"version":"3.51.4"},"reference-count":23,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,8,22]],"date-time":"2022-08-22T00:00:00Z","timestamp":1661126400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,8,22]],"date-time":"2022-08-22T00:00:00Z","timestamp":1661126400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"NATIONAL SCIENCE FOUNDATION","doi-asserted-by":"publisher","award":["1741472"],"award-info":[{"award-number":["1741472"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,8,22]]},"DOI":"10.1109\/mlsp55214.2022.9943352","type":"proceedings-article","created":{"date-parts":[[2022,11,17]],"date-time":"2022-11-17T20:39:35Z","timestamp":1668717575000},"page":"01-06","source":"Crossref","is-referenced-by-count":17,"title":["Rethinking Audio-Visual Synchronization for Active Speaker Detection"],"prefix":"10.1109","author":[{"given":"Abudukelimu","family":"Wuerkaixi","sequence":"first","affiliation":[{"name":"Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University,State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"You","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Rochester,Department of Electrical and Computer Engineering,Rochester,NY,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiyao","family":"Duan","sequence":"additional","affiliation":[{"name":"University of Rochester,Department of Electrical and Computer Engineering,Rochester,NY,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changshui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University,State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-80"},{"key":"ref11","article-title":"Is someone speaking? exploring long-term temporal features for audio-visual active speaker detection","author":"tao","year":"0","journal-title":"Proc ACM Multimedia"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475275"},{"key":"ref13","article-title":"How to design a three-stage architecture for audio-visual active speaker detection in the wild","author":"k\u00f6p\u00fckl\u00fc","year":"0","journal-title":"Proc ICCV"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref15","first-page":"251","article-title":"Out of time: automated lip sync in the wild","author":"son chung","year":"0","journal-title":"Proc ACCV"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054376"},{"key":"ref17","article-title":"Audio-visual synchronisation in the wild","author":"chen","year":"0","journal-title":"Proc BMVC"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00139"},{"key":"ref19","article-title":"Speech driven talking face generation from a single image and an emotion condition","author":"emre eskimez","year":"2021","journal-title":"IEEE Transactions on Multimedia"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2000.871073"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053900"},{"key":"ref5","first-page":"3675","article-title":"Look who's talking: Active speaker detection in the wild","author":"jin kim","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP.2014.6958874"},{"key":"ref7","article-title":"ATSC implementation subcommittee finding: Relative timing of sound and vision for broadcast operations","volume":"26","author":"television","year":"2003","journal-title":"IS-191"},{"key":"ref2","article-title":"Spot the conversation: Speaker diarisation in the wild","author":"son chung","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref1","first-page":"1086","article-title":"VoxCeleb2: Deep speaker recognition","author":"son chung","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.29007\/l734"},{"key":"ref20","first-page":"2758","article-title":"Lip to speech synthesis with visual context attentional GAN","author":"kim","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref22","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01428"}],"event":{"name":"2022 IEEE 32nd International Workshop on Machine Learning for Signal Processing (MLSP)","location":"Xi'an, China","start":{"date-parts":[[2022,8,22]]},"end":{"date-parts":[[2022,8,25]]}},"container-title":["2022 IEEE 32nd International Workshop on Machine Learning for Signal Processing (MLSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9943282\/9943306\/09943352.pdf?arnumber=9943352","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,12]],"date-time":"2022-12-12T20:00:04Z","timestamp":1670875204000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9943352\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,22]]},"references-count":23,"URL":"https:\/\/doi.org\/10.1109\/mlsp55214.2022.9943352","relation":{},"subject":[],"published":{"date-parts":[[2022,8,22]]}}}