{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T04:51:47Z","timestamp":1776919907889,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":14,"publisher":"ACM","license":[{"start":{"date-parts":[[2016,11,12]],"date-time":"2016-11-12T00:00:00Z","timestamp":1478908800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2016,11,12]]},"DOI":"10.1145\/3005467.3005470","type":"proceedings-article","created":{"date-parts":[[2016,11,9]],"date-time":"2016-11-09T20:43:27Z","timestamp":1478724207000},"page":"22-27","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["Look who's talking"],"prefix":"10.1145","author":[{"given":"Kalin","family":"Stefanov","sequence":"first","affiliation":[{"name":"KTH Royal Institute of Technology, Lindstedtsv\u00e4gen, Stockholm, Sweden"}]},{"given":"Akihiro","family":"Sugimoto","sequence":"additional","affiliation":[{"name":"National Institute of Informatics, Hitotsubashi, Chiyoda, Tokyo, Japan"}]},{"given":"Jonas","family":"Beskow","sequence":"additional","affiliation":[{"name":"KTH Royal Institute of Technology, Lindstedtsv\u00e4gen, Stockholm, Sweden"}]}],"member":"320","published-online":{"date-parts":[[2016,11,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Automatic Speech\/Non-Speech Classification Using Gestures in Dialogue. In Swedish Language Technology Conference","author":"Alexanderson S.","year":"2014","unstructured":"S. Alexanderson , J. Beskow , and D. House . Automatic Speech\/Non-Speech Classification Using Gestures in Dialogue. In Swedish Language Technology Conference , 2014 . S. Alexanderson, J. Beskow, and D. House. Automatic Speech\/Non-Speech Classification Using Gestures in Dialogue. In Swedish Language Technology Conference, 2014."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2125954"},{"key":"e_1_3_2_1_3_1","author":"Besson P.","year":"2008","unstructured":"P. Besson and M. Kunt . Hypothesis Testing for Evaluating a Multimodal Pattern Recognition Framework Applied to Speaker Detection. Journal of NeuroEngineering and Rehabilitation , 2008 . P. Besson and M. Kunt. Hypothesis Testing for Evaluating a Multimodal Pattern Recognition Framework Applied to Speaker Detection. Journal of NeuroEngineering and Rehabilitation, 2008.","journal-title":"Journal of NeuroEngineering and Rehabilitation"},{"key":"e_1_3_2_1_4_1","volume-title":"Facial Action Coding System: A Technique for the Measurement of Facial Movement","author":"Ekman P.","year":"1978","unstructured":"P. Ekman and W. Friesen . Facial Action Coding System: A Technique for the Measurement of Facial Movement . 1978 . P. Ekman and W. Friesen. Facial Action Coding System: A Technique for the Measurement of Facial Movement. 1978."},{"key":"e_1_3_2_1_5_1","volume-title":"Advances in Neural Information Processing Systems","author":"Fisher J. W.","year":"2000","unstructured":"J. W. Fisher , T. Darrell , W. T. Freeman , and P. Viola . Learning Joint Statistical Models for Audio-Visual Fusion and Segregation . In Advances in Neural Information Processing Systems , 2000 . J. W. Fisher, T. Darrell, W. T. Freeman, and P. Viola. 
Learning Joint Statistical Models for Audio-Visual Fusion and Segregation. In Advances in Neural Information Processing Systems, 2000."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/1631272.1631301"},{"key":"e_1_3_2_1_7_1","volume-title":"Speech\/Non-Speech Detection in Meetings from Automatically Extracted Low Resolution Visual Features. Technical report","author":"Hung H.","year":"2009","unstructured":"H. Hung and S. O. Ba . Speech\/Non-Speech Detection in Meetings from Automatically Extracted Low Resolution Visual Features. Technical report , 2009 . H. Hung and S. O. Ba. Speech\/Non-Speech Detection in Meetings from Automatically Extracted Low Resolution Visual Features. Technical report, 2009."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.5555\/1760167.1760228"},{"key":"e_1_3_2_1_9_1","volume-title":"A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition","author":"Rabiner L. R.","year":"1988","unstructured":"L. R. Rabiner . A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition . In Proceedings of IEEE , 1988 . L. R. Rabiner. A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition. In Proceedings of IEEE, 1988."},{"key":"e_1_3_2_1_10_1","volume-title":"Advances in Neural Information Processing Systems","author":"Slaney M.","year":"2000","unstructured":"M. Slaney and M. Covell . Facesync: A Linear Operator for Measuring Synchronization of Video Facial Images and Audio Tracks . In Advances in Neural Information Processing Systems , 2000 . M. Slaney and M. Covell. Facesync: A Linear Operator for Measuring Synchronization of Video Facial Images and Audio Tracks. In Advances in Neural Information Processing Systems, 2000."},{"key":"e_1_3_2_1_11_1","volume-title":"Language Resources and Evaluation Conference","author":"Stefanov K.","year":"2016","unstructured":"K. Stefanov and J. Beskow . A Multi-party Multi-modal Dataset for Focus of Visual Attention in Human-human and Human-robot Interaction . In Language Resources and Evaluation Conference , 2016 . K. Stefanov and J. Beskow. A Multi-party Multi-modal Dataset for Focus of Visual Attention in Human-human and Human-robot Interaction. 
In Language Resources and Evaluation Conference, 2016."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2008.2005602"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.1992.223161"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2008.2007344"}],"event":{"name":"ICMI '16: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Tokyo Japan","acronym":"ICMI '16"},"container-title":["Proceedings of the 2nd Workshop on Advancements in Social Signal Processing for Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3005467.3005470","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3005467.3005470","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:23:51Z","timestamp":1750220631000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3005467.3005470"}},"subtitle":["visual identification of the active speaker in multi-party human-robot interaction"],"short-title":[],"issued":{"date-parts":[[2016,11,12]]},"references-count":14,"alternative-id":["10.1145\/3005467.3005470","10.1145\/3005467"],"URL":"https:\/\/doi.org\/10.1145\/3005467.3005470","relation":{},"subject":[],"published":{"date-parts":[[2016,11,12]]},"assertion":[{"value":"2016-11-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
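
A record like the one above is what the public Crossref REST API returns for a single work; the interesting bibliographic fields all live under the "message" key. The following is a minimal, illustrative sketch (assuming the third-party `requests` library is installed and the api.crossref.org endpoint is reachable) of how such a record can be fetched and a citation-style summary pulled out of it; the field names mirror the JSON shown above.

import requests

DOI = "10.1145/3005467.3005470"

# The Crossref works endpoint returns {"status": "ok", ..., "message": {...}},
# i.e. the same structure as the record above.
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=10)
resp.raise_for_status()
work = resp.json()["message"]

title = work["title"][0]                          # "Look who's talking"
subtitle = work.get("subtitle", [""])[0]          # "visual identification of the active speaker ..."
authors = [f'{a["given"]} {a["family"]}' for a in work.get("author", [])]
year = work["issued"]["date-parts"][0][0]         # 2016
venue = work.get("container-title", [""])[0]

print(f"{', '.join(authors)} ({year}). {title}: {subtitle}. {venue}. "
      f"https://doi.org/{work['DOI']}")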