{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T17:43:45Z","timestamp":1764783825598,"version":"3.41.0"},"reference-count":33,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015,12]]},"DOI":"10.1109\/asru.2015.7404820","type":"proceedings-article","created":{"date-parts":[[2016,2,12]],"date-time":"2016-02-12T13:55:42Z","timestamp":1455285342000},"page":"383-389","source":"Crossref","is-referenced-by-count":5,"title":["Multimodal embedding fusion for robust speaker role recognition in video broadcast"],"prefix":"10.1109","author":[{"given":"Michael","family":"Rouvier","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sebastien","family":"Delecraz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Benoit","family":"Favre","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Meriem","family":"Bendris","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Frederic","family":"Bechet","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.sigpro.2010.08.010"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2005.1415194"},{"article-title":"Icsiboost","year":"2007","author":"favre","key":"ref31"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CBMI.2010.5529907"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-13168-9_34"},{"key":"ref11","article-title":"Deep learning for efficient discriminative parsing","author":"collobert","year":"2011","journal-title":"AISTATS"},{"article-title":"Efficient estimation of word representations in vector space","year":"2013","author":"mikolov","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1181"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2015.7362751"},{"key":"ref15","article-title":"The kaldi speech recognition toolkit","author":"povey","year":"2012","journal-title":"Idiap-RR Idiap-RR-04-2012 Idiap Rue Marconi 19 Martigny"},{"key":"ref16","first-page":"1097","article-title":"Imagenet classification with deep convolutional neural network","volume":"25","author":"krizhevsky","year":"2012","journal-title":"Advances in neural information processing systems"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-1068"},{"key":"ref28","article-title":"Srilm-an extensible language modeling toolkit","author":"stolcke","year":"2002","journal-title":"InterSpeech"},{"key":"ref4","first-page":"10h","article-title":"Investigation of spontaneous speech characterization applied to speaker role recognition","volume":"11","author":"dufour","year":"2011","journal-title":"Reporter"},{"article-title":"French gigaword","year":"2009","author":"mendon\u00e7a","key":"ref27"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.3115\/1614049.1614070"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5494958"},{"key":"ref29","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2014-503","article-title":"Speaker adaptation of dnn-based asr with i-vectors: Does it actually adapt models to speakers?","author":"rouvier","year":"2014","journal-title":"InterSpeech"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/1878101.1878104"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5947618"},{"key":"ref7","first-page":"717","article-title":"Extracting Phrase Patterns with Minimum Redundancy for Unsupervised Speaker Role Classification","author":"zhang","year":"2010","journal-title":"Human Language Technologies The 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics"},{"key":"ref2","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2010-684","article-title":"Social role discovery from spoken language using dynamic bayesian networks","author":"yaman","year":"2010","journal-title":"InterSpeech"},{"key":"ref9","doi-asserted-by":"crossref","first-page":"1285","DOI":"10.21437\/Interspeech.2011-430","article-title":"Multi-view approach for speaker turn role labeling in TV broadcast news shows","author":"damnati","year":"2011","journal-title":"InterSpeech"},{"key":"ref1","article-title":"Multimodal deep learning","author":"ngiam","year":"2011","journal-title":"International Conference on Machine Learning (ICML)"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1016"},{"key":"ref22","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2013-383","article-title":"An open-source state-of-the-art toolbox for broadcast news diarization","author":"rouvier","year":"2013","journal-title":"InterSpeech"},{"key":"ref21","article-title":"The repere corpus: a multimodal corpus for person recognition","author":"giraudel","year":"2012","journal-title":"LREC"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1006\/csla.1996.0013"},{"key":"ref23","article-title":"A global optimization framework for speaker diarization","author":"rouvier","year":"2012","journal-title":"Speaker Odyssey"},{"key":"ref26","article-title":"The epac corpus: Manual and automatic annotations of conversational speech in french broadcast news","author":"esteve","year":"2010","journal-title":"LREC"},{"key":"ref25","doi-asserted-by":"crossref","first-page":"2583","DOI":"10.21437\/Interspeech.2009-680","article-title":"The ester 2 evaluation campaign for the rich transcription of french radio broadcasts","volume":"9","author":"galliano","year":"2009","journal-title":"InterSpeech"}],"event":{"name":"2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)","start":{"date-parts":[[2015,12,13]]},"location":"Scottsdale, AZ","end":{"date-parts":[[2015,12,17]]}},"container-title":["2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7397480\/7404758\/07404820.pdf?arnumber=7404820","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T10:58:01Z","timestamp":1748775481000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7404820\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,12]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/asru.2015.7404820","relation":{},"subject":[],"published":{"date-parts":[[2015,12]]}}}