{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,28]],"date-time":"2025-09-28T11:13:29Z","timestamp":1759058009501,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":21,"publisher":"ACM","license":[{"start":{"date-parts":[[2016,10,31]],"date-time":"2016-10-31T00:00:00Z","timestamp":1477872000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2016,10,31]]},"DOI":"10.1145\/2993148.2993172","type":"proceedings-article","created":{"date-parts":[[2016,11,1]],"date-time":"2016-11-01T13:46:03Z","timestamp":1478007963000},"page":"312-316","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":21,"title":["Active speaker detection with audio-visual co-training"],"prefix":"10.1145","author":[{"given":"Punarjay","family":"Chakravarty","sequence":"first","affiliation":[{"name":"KU Leuven, Belgium \/ iMinds, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jeroen","family":"Zegers","sequence":"additional","affiliation":[{"name":"KU Leuven, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tinne","family":"Tuytelaars","sequence":"additional","affiliation":[{"name":"KU Leuven, Belgium \/ iMinds, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hugo","family":"Van hamme","sequence":"additional","affiliation":[{"name":"KU Leuven, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2016,10,31]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/279943.279962"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2818346.2820780"},{"key":"e_1_3_2_2_3_1","unstructured":"P. Chakravarty and T. Tuytelaars. Cross-modal supervision for learning active speaker detection in video (http:\/\/arxiv.org\/abs\/1603.08907v1).  P. Chakravarty and T. Tuytelaars. Cross-modal supervision for learning active speaker detection in video (http:\/\/arxiv.org\/abs\/1603.08907v1)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2000.871073"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-22482-4_54"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2008.04.018"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2015.96"},{"key":"e_1_3_2_2_9_1","first-page":"736","volume-title":"INTERSPEECH","author":"Germain F.","year":"2013"},{"key":"e_1_3_2_2_10_1","unstructured":"R. B. Girshick P. F. Felzenszwalb and D. McAllester. Discriminatively trained deformable part models release 5. http:\/\/people.cs.uchicago.edu\/ rbg\/latent-release5\/.  R. B. Girshick P. F. Felzenszwalb and D. McAllester. Discriminatively trained deformable part models release 5. http:\/\/people.cs.uchicago.edu\/ rbg\/latent-release5\/."},{"volume-title":"Odyssey: The Speaker and Language Recognition Workshop","year":"2014","author":"Greenberg C. S.","key":"e_1_3_2_2_11_1"},{"key":"e_1_3_2_2_12_1","first-page":"2141","volume-title":"INTERSPEECH","author":"Hurmalainen A.","year":"2012"},{"volume-title":"Sixteenth Annual Conference of the International Speech Communication Association","year":"2015","author":"Hurmalainen A.","key":"e_1_3_2_2_13_1"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2012.2228476"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1038\/44565"},{"first-page":"562","volume-title":"Advances in Neural Information Processing Systems 13","author":"Lee D. D.","key":"e_1_3_2_2_16_1"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2005.248630"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1011175531609"},{"key":"e_1_3_2_2_19_1","unstructured":"J. Ren Y. Hu Y.-W. Tai C. Wang L. Xu W. Sun and Q. Yan. Look listen and learn - a multimodal lstm for speaker identification.  J. Ren Y. Hu Y.-W. Tai C. Wang L. Xu W. Sun and Q. Yan. Look listen and learn - a multimodal lstm for speaker identification."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-773"}],"event":{"name":"ICMI '16: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"],"location":"Tokyo Japan","acronym":"ICMI '16"},"container-title":["Proceedings of the 18th ACM International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2993148.2993172","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2993148.2993172","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T03:50:02Z","timestamp":1750218602000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2993148.2993172"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,10,31]]},"references-count":21,"alternative-id":["10.1145\/2993148.2993172","10.1145\/2993148"],"URL":"https:\/\/doi.org\/10.1145\/2993148.2993172","relation":{},"subject":[],"published":{"date-parts":[[2016,10,31]]},"assertion":[{"value":"2016-10-31","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}