{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:16:01Z","timestamp":1778080561849,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Key R&D Program of China","award":["2017YFA0700804"],"award-info":[{"award-number":["2017YFA0700804"]}]},{"name":"the National Natural Science Foundation of China","award":["61876171"],"award-info":[{"award-number":["61876171"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475275","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T17:45:27Z","timestamp":1634579127000},"page":"3964-3972","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":39,"title":["UniCon: Unified Context Network for Robust Active Speaker Detection"],"prefix":"10.1145","author":[{"given":"Yuanhang","family":"Zhang","sequence":"first","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Susan","family":"Liang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Shuang","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Xiao","family":"Liu","sequence":"additional","affiliation":[{"name":"Tomorrow Advancing Life, Beijing, China"}]},{"given":"Zhongqin","family":"Wu","sequence":"additional","affiliation":[{"name":"Tomorrow Advancing Life, Beijing, China"}]},{"given":"Shiguang","family":"Shan","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Xilin","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Proceedings, Part XVIII (Lecture Notes in Computer Science","volume":"224","author":"Afouras Triantafyllos","year":"2020"},{"key":"e_1_3_2_2_2_1","volume-title":"Pablo Arbel\u00e1 ez, and Bernard Ghanem","author":"Juan Le\u00f3","year":"2020"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"C. Beyan M. Shahid and V. Murino. 2020. RealVAD: A Real-world Dataset and A Method for Voice Activity Detection by Body Motion Analysis. IEEE Transactions on Multimedia (2020) 1--1. https:\/\/doi.org\/10.1109\/TMM.2020.3007350  C. Beyan M. Shahid and V. Murino. 2020. RealVAD: A Real-world Dataset and A Method for Voice Activity Detection by Body Motion Analysis. IEEE Transactions on Multimedia (2020) 1--1. https:\/\/doi.org\/10.1109\/TMM.2020.3007350","DOI":"10.1109\/TMM.2020.3007350"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2967202"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/HUMANOIDS.2013.7029977"},{"key":"e_1_3_2_2_7_1","volume-title":"ECCV (5) (Lecture Notes in Computer Science","author":"Chakravarty Punarjay"},{"key":"e_1_3_2_2_8_1","volume-title":"Naver at ActivityNet Challenge 2019 - Task B Active Speaker Detection (AVA). CoRR","author":"Chung Joon Son","year":"2019"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Joon Son Chung Jaesung Huh Arsha Nagrani Triantafyllos Afouras and Andrew Zisserman. 2020. Spot the Conversation: Speaker Diarisation in the Wild. In INTERSPEECH. ISCA 299--303.  Joon Son Chung Jaesung Huh Arsha Nagrani Triantafyllos Afouras and Andrew Zisserman. 2020. Spot the Conversation: Speaker Diarisation in the Wild. In INTERSPEECH. ISCA 299--303.","DOI":"10.21437\/Interspeech.2020-2337"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Joon Son Chung Arsha Nagrani and Andrew Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In INTERSPEECH. ISCA 1086--1090.  Joon Son Chung Arsha Nagrani and Andrew Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In INTERSPEECH. ISCA 1086--1090.","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"e_1_3_2_2_11_1","volume-title":"Out of Time: Automated Lip Sync in the Wild. In ACCV Workshops (2) (Lecture Notes in Computer Science","volume":"263","author":"Chung Joon Son","year":"2016"},{"key":"e_1_3_2_2_12_1","volume-title":"Joon Son Chung, and Hong-Goo Kang","author":"Chung Soo-Whan","year":"2019"},{"key":"e_1_3_2_2_13_1","volume-title":"IEEE International Conference on Multimedia and Expo (III). IEEE Computer Society, 1589--1592","author":"Cutler Ross"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","volume-title":"Multimodal Active Speaker Detection and Virtual Cinematography for Video Conferencing","author":"Cutler Ross","DOI":"10.1109\/ICASSP40776.2020.9053171"},{"key":"e_1_3_2_2_15_1","volume-title":"RetinaFace: Single-Shot Multi-Level Face Localisation in the Wild","author":"Deng Jiankang"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412681"},{"key":"e_1_3_2_2_17_1","volume-title":"Self-Supervised Learning for Audio-Visual Speaker Diarization","author":"Ding Yifan"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"crossref","unstructured":"Y. Fan J. W. Kang L. T. Li K. C. Li H. L. Chen S. T. Cheng P. Y. Zhang Z. Y. Zhou Y. Q. Cai and D. Wang. 2020. CN-Celeb: A Challenging Chinese Speaker Recognition Dataset. In ICASSP. IEEE 7604--7608.  Y. Fan J. W. Kang L. T. Li K. C. Li H. L. Chen S. T. Cheng P. Y. Zhang Z. Y. Zhou Y. Q. Cai and D. Wang. 2020. CN-Celeb: A Challenging Chinese Speaker Recognition Dataset. In ICASSP. IEEE 7604--7608.","DOI":"10.1109\/ICASSP40776.2020.9054017"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/1631272.1631387"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Giulia Garau Alfred Dielmann and Herv\u00e9 Bourlard. 2010. Audio-visual synchronisation for speaker diarisation. In INTERSPEECH. ISCA 2654--2657.  Giulia Garau Alfred Dielmann and Herv\u00e9 Bourlard. 2010. Audio-visual synchronisation for speaker diarisation. In INTERSPEECH. ISCA 2654--2657.","DOI":"10.21437\/Interspeech.2010-704"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2648793"},{"key":"e_1_3_2_2_23_1","volume-title":"Video Action Transformer Network. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019","author":"Girdhar Rohit","year":"2019"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.123"},{"key":"e_1_3_2_2_25_1","volume-title":"Deep Residual Learning for Image Recognition","author":"He Kaiming"},{"key":"e_1_3_2_2_26_1","volume-title":"Using audio-visual information to understand speaker activity: Tracking active speakers on and off screen","author":"Hoover Ken"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00483"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045167"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2010-571"},{"key":"e_1_3_2_2_30_1","volume-title":"Some functions of gaze-direction in social interaction. Acta psychologica","author":"Kendon Adam","year":"1967"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1142\/S021946780100027X"},{"key":"e_1_3_2_2_32_1","volume-title":"Decoupled Weight Decay Regularization. In 7th International Conference on Learning Representations, ICLR 2019","author":"Loshchilov Ilya","year":"2019"},{"key":"e_1_3_2_2_33_1","volume-title":"Vicky Kalogeiton, Pablo Medina-Suarez, and Andrew Zisserman.","author":"Manuel","year":"2019"},{"key":"e_1_3_2_2_34_1","volume-title":"Actor-Context-Actor Relation Network for Spatio-Temporal Action Localization. CoRR","author":"Pan Junting","year":"2020"},{"key":"e_1_3_2_2_35_1","volume-title":"AVA-ActiveSpeaker: An Audio-Visual Dataset for Active Speaker Detection","author":"Roth Joseph"},{"key":"e_1_3_2_2_36_1","volume-title":"FaceNet: A unified embedding for face recognition and clustering","author":"Schroff Florian"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00159"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00238"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.5555\/3008751.3008865"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_2_41_1","volume-title":"ECCV (25) (Lecture Notes in Computer Science","author":"Wu Jianchao"},{"key":"e_1_3_2_2_42_1","first-page":"1","article-title":"Multi-Task Learning for Audio-Visual Active Speaker Detection","volume":"2019","author":"Zhang Yuan-Hang","year":"2019","journal-title":"The ActivityNet Large-Scale Activity Recognition Challenge"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475275","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475275","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:17Z","timestamp":1750193297000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475275"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":42,"alternative-id":["10.1145\/3474085.3475275","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475275","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}