{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T04:46:38Z","timestamp":1750308398740,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T00:00:00Z","timestamp":1634515200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["IIS-1925083"],"award-info":[{"award-number":["IIS-1925083"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,18]]},"DOI":"10.1145\/3461615.3485430","type":"proceedings-article","created":{"date-parts":[[2021,12,18]],"date-time":"2021-12-18T04:57:40Z","timestamp":1639803460000},"page":"37-42","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Group-Level Focus of Visual Attention for Improved Active Speaker Detection"],"prefix":"10.1145","author":[{"given":"Christopher","family":"Birmingham","sequence":"first","affiliation":[{"name":"Computer Science\/Interaction Lab, University of Southern California, United States"}]},{"given":"Maja","family":"Mataric","sequence":"additional","affiliation":[{"name":"University of Southern California, United States"}]},{"given":"Kalin","family":"Stefanov","sequence":"additional","affiliation":[{"name":"Monash University, Australia"}]}],"member":"320","published-online":{"date-parts":[[2021,12,17]]},"reference":[{"volume-title":"Proceedings of the International Conference on Machine Learning and Applications, Vol.\u00a02. 525\u2013528","author":"Ahmad R.","unstructured":"R. 
Ahmad , S.\u00a0 P. Raza , and H. Malik . 2013. Visual Speech Detection Using an Unsupervised Learning Framework . In Proceedings of the International Conference on Machine Learning and Applications, Vol.\u00a02. 525\u2013528 . R. Ahmad, S.\u00a0P. Raza, and H. Malik. 2013. Visual Speech Detection Using an Unsupervised Learning Framework. In Proceedings of the International Conference on Machine Learning and Applications, Vol.\u00a02. 525\u2013528.","key":"e_1_3_2_1_1_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_2_1","DOI":"10.1109\/TASL.2011.2125954"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1109\/WACV.2016.7477553"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_4_1","DOI":"10.1186\/1743-0003-5-11"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_5_1","DOI":"10.1109\/ICRA40945.2020.9196875"},{"volume-title":"Proceedings of the ACM on International Conference on Multimodal Interaction. 87\u201390","author":"Chakravarty P.","unstructured":"P. Chakravarty , S. Mirzaei , T. Tuytelaars , and H. Van\u00a0Hamme . 2015. Who\u2019s Speaking? Audio-Supervised Classification of Active Speakers in Video . In Proceedings of the ACM on International Conference on Multimodal Interaction. 87\u201390 . P. Chakravarty, S. Mirzaei, T. Tuytelaars, and H. Van\u00a0Hamme. 2015. Who\u2019s Speaking? Audio-Supervised Classification of Active Speakers in Video. In Proceedings of the ACM on International Conference on Multimodal Interaction. 87\u201390.","key":"e_1_3_2_1_6_1"},{"volume-title":"Proceedings of the European Conference on Computer Vision. 285\u2013301","author":"Chakravarty P.","unstructured":"P. Chakravarty and T. Tuytelaars . 2016. Cross-Modal Supervision for Learning Active Speaker Detection in Video . In Proceedings of the European Conference on Computer Vision. 285\u2013301 . P. Chakravarty and T. Tuytelaars. 2016. Cross-Modal Supervision for Learning Active Speaker Detection in Video. 
In Proceedings of the European Conference on Computer Vision. 285\u2013301.","key":"e_1_3_2_1_7_1"},{"unstructured":"J.\u00a0S. Chung. 2019. Naver at ActivityNet Challenge 2019 \u2013 Task B Active Speaker Detection (AVA). arXiv preprint arXiv:1906.10555(2019).  J.\u00a0S. Chung. 2019. Naver at ActivityNet Challenge 2019 \u2013 Task B Active Speaker Detection (AVA). arXiv preprint arXiv:1906.10555(2019).","key":"e_1_3_2_1_8_1"},{"volume-title":"Proceedings of the Workshop on Multi-view Lip-reading.","author":"Chung S.","unstructured":"J.\u00a0 S. Chung and A. Zisserman . 2016. Out of Time: Automated Lip Sync in the Wild . In Proceedings of the Workshop on Multi-view Lip-reading. J.\u00a0S. Chung and A. Zisserman. 2016. Out of Time: Automated Lip Sync in the Wild. In Proceedings of the Workshop on Multi-view Lip-reading.","key":"e_1_3_2_1_9_1"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing. 3965\u20133969","author":"Chung S.-W.","year":"2019","unstructured":"S.-W. Chung , J.\u00a0 S. Chung , and H.-G. Kang . 2019 . PerfectMatch: Improved Cross-modal Embeddings for Audio-visual Synchronisation . In Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing. 3965\u20133969 . S.-W. Chung, J.\u00a0S. Chung, and H.-G. Kang. 2019. PerfectMatch: Improved Cross-modal Embeddings for Audio-visual Synchronisation. In Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing. 3965\u20133969."},{"volume-title":"Proceedings of the IEEE International Conference on Multimedia and Expo, Vol.\u00a03. 1589\u20131592","author":"Cutler R.","unstructured":"R. Cutler and L. Davis . 2000. Look Who\u2019s Talking: Speaker Detection Using Video and Audio Correlation . In Proceedings of the IEEE International Conference on Multimedia and Expo, Vol.\u00a03. 1589\u20131592 . R. Cutler and L. Davis. 2000. 
Look Who\u2019s Talking: Speaker Detection Using Video and Audio Correlation. In Proceedings of the IEEE International Conference on Multimedia and Expo, Vol.\u00a03. 1589\u20131592.","key":"e_1_3_2_1_11_1"},{"key":"e_1_3_2_1_12_1","volume-title":"Frame Analysis: An Essay on the Organization of Experience","author":"Goffman E.","year":"1974","unstructured":"E. Goffman . 1974 . Frame Analysis: An Essay on the Organization of Experience . Harvard University Press . E. Goffman. 1974. Frame Analysis: An Essay on the Organization of Experience. Harvard University Press."},{"volume-title":"Forms of Talk","author":"Goffman E.","unstructured":"E. Goffman . 1981. Forms of Talk . University of Pennsylvania Press. E. Goffman. 1981. Forms of Talk. University of Pennsylvania Press.","key":"e_1_3_2_1_13_1"},{"volume-title":"Proceedings of the ACM International Conference on Multimedia. 1107\u20131110","author":"Hu Y.","unstructured":"Y. Hu , J. Ren , J. Dai , C. Yuan , L. Xu , and W. Wang . 2015. Deep Multimodal Speaker Naming . In Proceedings of the ACM International Conference on Multimedia. 1107\u20131110 . Y. Hu, J. Ren, J. Dai, C. Yuan, L. Xu, and W. Wang. 2015. Deep Multimodal Speaker Naming. In Proceedings of the ACM International Conference on Multimedia. 1107\u20131110.","key":"e_1_3_2_1_14_1"},{"key":"e_1_3_2_1_15_1","volume-title":"Speech\/Non-Speech Detection in Meetings From Automatically Extracted Low Resolution Visual Features. Technical Report. Idiap.","author":"Hung H.","year":"2009","unstructured":"H. Hung and S.\u00a0 O. Ba . 2009 . Speech\/Non-Speech Detection in Meetings From Automatically Extracted Low Resolution Visual Features. Technical Report. Idiap. H. Hung and S.\u00a0O. Ba. 2009. Speech\/Non-Speech Detection in Meetings From Automatically Extracted Low Resolution Visual Features. Technical Report. 
Idiap."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1109\/TMM.2014.2305632"},{"volume-title":"Proceedings of the Annual Conference of the International Speech Communication Association.","author":"Nagrani A.","unstructured":"A. Nagrani , J.\u00a0 S. Chung , and A. Zisserman . 2017. VoxCeleb: A Large-Scale Speaker Identification Dataset . In Proceedings of the Annual Conference of the International Speech Communication Association. A. Nagrani, J.\u00a0S. Chung, and A. Zisserman. 2017. VoxCeleb: A Large-Scale Speaker Identification Dataset. In Proceedings of the Annual Conference of the International Speech Communication Association.","key":"e_1_3_2_1_17_1"},{"volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence. 3581\u20133587","author":"Ren J.","unstructured":"J. Ren , Y. Hu , Y.-W. Tai , C. Wang , L. Xu , W. Sun , and Q. Yan . 2016. Look, Listen and Learn - A Multimodal LSTM for Speaker Identification . In Proceedings of the AAAI Conference on Artificial Intelligence. 3581\u20133587 . J. Ren, Y. Hu, Y.-W. Tai, C. Wang, L. Xu, W. Sun, and Q. Yan. 2016. Look, Listen and Learn - A Multimodal LSTM for Speaker Identification. In Proceedings of the AAAI Conference on Artificial Intelligence. 3581\u20133587.","key":"e_1_3_2_1_18_1"},{"doi-asserted-by":"crossref","unstructured":"J. Roth S. Chaudhuri O. Klejch R. Marvin A. Gallagher L. Kaver S. Ramaswamy A. Stopczynski C. Schmid Z. Xi and C. Pantofaru. 2019. AVA-ActiveSpeaker: An Audio-Visual Dataset for Active Speaker Detection. arXiv preprint arXiv:1901.01342(2019).  J. Roth S. Chaudhuri O. Klejch R. Marvin A. Gallagher L. Kaver S. Ramaswamy A. Stopczynski C. Schmid Z. Xi and C. Pantofaru. 2019. AVA-ActiveSpeaker: An Audio-Visual Dataset for Active Speaker Detection. 
arXiv preprint arXiv:1901.01342(2019).","key":"e_1_3_2_1_19_1","DOI":"10.1109\/ICCVW.2019.00460"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_20_1","DOI":"10.1109\/TCSVT.2008.2009262"},{"volume-title":"Proceedings of the British Machine Vision Conference. 1\u201311","author":"Son S.","unstructured":"J.\u00a0 S. Son and A. Zisserman . 2017. Lip Reading in Profile . In Proceedings of the British Machine Vision Conference. 1\u201311 . J.\u00a0S. Son and A. Zisserman. 2017. Lip Reading in Profile. In Proceedings of the British Machine Vision Conference. 1\u201311.","key":"e_1_3_2_1_21_1"},{"volume-title":"Proceedings of the International Conference on Pattern Recognition. 10433\u201310440","author":"Stefanov K.","unstructured":"K. Stefanov , M. Adiban , and G. Salvi . 2021. Spatial Bias in Vision-Based Voice Activity Detection . In Proceedings of the International Conference on Pattern Recognition. 10433\u201310440 . K. Stefanov, M. Adiban, and G. Salvi. 2021. Spatial Bias in Vision-Based Voice Activity Detection. In Proceedings of the International Conference on Pattern Recognition. 10433\u201310440.","key":"e_1_3_2_1_22_1"},{"volume-title":"Proceedings of the International Conference on Language Resources and Evaluation.","author":"Stefanov K.","unstructured":"K. Stefanov and J. Beskow . 2016. A Multi-Party Multi-Modal Dataset for Focus of Visual Attention in Human-Human and Human-Robot Interaction . In Proceedings of the International Conference on Language Resources and Evaluation. K. Stefanov and J. Beskow. 2016. A Multi-Party Multi-Modal Dataset for Focus of Visual Attention in Human-Human and Human-Robot Interaction. In Proceedings of the International Conference on Language Resources and Evaluation.","key":"e_1_3_2_1_23_1"},{"volume-title":"Proceedings of the Grounding Language Understanding. 47\u201351","author":"Stefanov K.","unstructured":"K. Stefanov , J. Beskow , and G. Salvi . 2017. Vision-Based Active Speaker Detection in Multiparty Interaction . 
In Proceedings of the Grounding Language Understanding. 47\u201351 . K. Stefanov, J. Beskow, and G. Salvi. 2017. Vision-Based Active Speaker Detection in Multiparty Interaction. In Proceedings of the Grounding Language Understanding. 47\u201351.","key":"e_1_3_2_1_24_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_25_1","DOI":"10.1109\/TCDS.2019.2927941"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_26_1","DOI":"10.1145\/3323231"},{"volume-title":"Proceedings of the Advancements in Social Signal Processing for Multimodal Interaction. 22\u201327","author":"Stefanov K.","unstructured":"K. Stefanov , A. Sugimoto , and J. Beskow . 2016. Look Who\u2019s Talking: Visual Identification of the Active Speaker in Multi-Party Human-Robot Interaction . In Proceedings of the Advancements in Social Signal Processing for Multimodal Interaction. 22\u201327 . K. Stefanov, A. Sugimoto, and J. Beskow. 2016. Look Who\u2019s Talking: Visual Identification of the Active Speaker in Multi-Party Human-Robot Interaction. In Proceedings of the Advancements in Social Signal Processing for Multimodal Interaction. 
22\u201327.","key":"e_1_3_2_1_27_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_28_1","DOI":"10.1109\/TASL.2006.878256"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_29_1","DOI":"10.1109\/TCSVT.2008.2005602"}],"event":{"sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"],"acronym":"ICMI '21","name":"ICMI '21: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Montreal QC Canada"},"container-title":["Companion Publication of the 2021 International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3461615.3485430","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3461615.3485430","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3461615.3485430","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T17:45:10Z","timestamp":1750268710000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3461615.3485430"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,18]]},"references-count":29,"alternative-id":["10.1145\/3461615.3485430","10.1145\/3461615"],"URL":"https:\/\/doi.org\/10.1145\/3461615.3485430","relation":{},"subject":[],"published":{"date-parts":[[2021,10,18]]},"assertion":[{"value":"2021-12-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}