{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:27:07Z","timestamp":1740122827897,"version":"3.37.3"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"20","license":[{"start":{"date-parts":[[2018,4,11]],"date-time":"2018-04-11T00:00:00Z","timestamp":1523404800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100002878","name":"Consejer\u00eda de Econom\u00eda, Innovaci\u00f3n, Ciencia y Empleo, Junta de Andaluc\u00eda","doi-asserted-by":"publisher","award":["2010-TIC6762"],"award-info":[{"award-number":["2010-TIC6762"]}],"id":[{"id":"10.13039\/501100002878","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003329","name":"Ministerio de Econom\u00eda y Competitividad","doi-asserted-by":"publisher","award":["TEC2015-67387-C4-2-R"],"award-info":[{"award-number":["TEC2015-67387-C4-2-R"]}],"id":[{"id":"10.13039\/501100003329","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2018,10]]},"DOI":"10.1007\/s11042-018-5944-2","type":"journal-article","created":{"date-parts":[[2018,4,11]],"date-time":"2018-04-11T03:36:48Z","timestamp":1523417808000},"page":"27685-27707","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Multimodal speaker diarization for meetings using volume-evaluated SRP-PHAT and video analysis"],"prefix":"10.1007","volume":"77","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2452-6037","authenticated-orcid":false,"given":"P.","family":"Caba\u00f1as-Molero","sequence":"first","affiliation":[]},{"given":"M.","family":"Lucena","sequence":"additional","affiliation":[]},{"given":"J. M.","family":"Fuertes","sequence":"additional","affiliation":[]},{"given":"P.","family":"Vera-Candeas","sequence":"additional","affiliation":[]},{"given":"N.","family":"Ruiz-Reyes","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,4,11]]},"reference":[{"key":"5944_CR1","doi-asserted-by":"crossref","unstructured":"Ajmera J, Lathoud G, McCowan L (2004) Clustering and segmenting speakers and their locations in meetings. In: IEEE international conference on acoustics, speech, and signal processing (ICASSP), vol 1, pp 605\u2013608","DOI":"10.1109\/ICASSP.2004.1326058"},{"issue":"2","key":"5944_CR2","doi-asserted-by":"publisher","first-page":"356","DOI":"10.1109\/TASL.2011.2125954","volume":"20","author":"X Anguera","year":"2012","unstructured":"Anguera X, Bozonnet S, Evans N, Fredouille C, Friedland G, Vinyals O (2012) Speaker diarization: a review of recent research. IEEE Trans Audio Speech Lang Process 20(2):356\u2013370","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"5944_CR3","doi-asserted-by":"crossref","unstructured":"Araki S, Hori T, Fujimoto M, Watanabe S, Yoshioka T, Nakatani T, Nakamura A (2010) Online meeting recognizer with multichannel speaker diarization. In: 44th ASILOMAR conference on signals, systems and computers, pp 1697\u20131701","DOI":"10.1109\/ACSSC.2010.5757829"},{"key":"5944_CR4","doi-asserted-by":"crossref","unstructured":"Araki S, Okada M, Higuchi T, Ogawa A, Nakatani T (2016) Spatial correlation model based observation vector clustering and MVDR beamforming for meeting recognition. In: IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 385\u2013389","DOI":"10.1109\/ICASSP.2016.7471702"},{"key":"5944_CR5","unstructured":"Aubrey A, Rivet B, Hicks Y, Girin L, Chambers J, Jutten C (2007) Two novel visual voice activity detectors based on appearance models and retinal filtering. In: 15th european signal processing conference (EUSIPCO), pp 2409\u20132413"},{"key":"5944_CR6","doi-asserted-by":"crossref","unstructured":"Bergh TF, Hafizovic I, Holm S (2016) Multi-speaker voice activity detection using a camera-assisted microphone array. In: 23rd international conference on systems, signals and image processing (IWSSIP), pp 1\u20134","DOI":"10.1109\/IWSSIP.2016.7502768"},{"key":"5944_CR7","doi-asserted-by":"publisher","first-page":"465","DOI":"10.1007\/978-3-319-39627-9_41","volume-title":"Intelligent Decision Technologies 2016","author":"Giorgio Biagetti","year":"2016","unstructured":"Biagetti G, Crippa P, Falaschetti L, Orcioni S, Turchetti C (2016) Robust speaker identification in a meeting with short audio segments, pp 465\u2013477. Springer International Publishing, Cham"},{"issue":"4","key":"5944_CR8","doi-asserted-by":"publisher","first-page":"373","DOI":"10.1016\/j.patrec.2011.09.002","volume":"33","author":"DA Blauth","year":"2012","unstructured":"Blauth DA, Minotto VP, Jung CR, Lee B, Kalker T (2012) Voice activity detection and speaker localization using audiovisual cues. Pattern Recogn Lett 33(4):373\u2013380","journal-title":"Pattern Recogn Lett"},{"key":"5944_CR9","unstructured":"Carletta J, Ashby S, Bourban S, Flynn M, Guillemot M, Hain T, Kadlec J, Karaiskos V, Kraaij W, Kronenthal M, Lathoud G, Lincoln M, Lisowska A, McCowan I, Post W, Reidsma D, Wellner P (2005) The AMI meeting corpus: a pre-announcement. In: International workshop on machine learning for multimodal interaction. Springer, pp 28\u201339"},{"issue":"1","key":"5944_CR10","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1109\/LSP.2010.2091502","volume":"18","author":"M Cobos","year":"2011","unstructured":"Cobos M, Marti A, Lopez JJ (2011) A modified SRP-PHAT functional for robust real-time sound source localization with scalable spatial sampling. IEEE Signal Processing Letters 18(1):71\u201374","journal-title":"IEEE Signal Processing Letters"},{"key":"5944_CR11","unstructured":"DiBiase JH (2000) A high-accuracy, low-latency technique for talker localization in reverberant environments. Ph.D. thesis, Brown University, Providence, RI"},{"key":"5944_CR12","doi-asserted-by":"crossref","unstructured":"Do H, Silverman HF, Yu Y (2007) A real-time SRP-PHAT source location implementation using stochastic region contraction (SRC) on a large-aperture microphone array. In: IEEE International conference on acoustics, speech and signal processing (ICASSP), vol 1, pp 121\u2013124","DOI":"10.1109\/ICASSP.2007.366631"},{"key":"5944_CR13","unstructured":"Fredouille C, Bozonnet S, Evans N (2009) The LIA-EURECOM RT\u201909 speaker diarization system. In: RT\u201909 NIST Rich transcription workshop, vol 15, pp 17\u201323"},{"key":"5944_CR14","doi-asserted-by":"crossref","unstructured":"Friedland G, Hung H, Yeo C (2009) Multi-modal speaker diarization of real-world meetings using compressed-domain video features. In: IEEE International conference on acoustics, speech and signal processing (ICASSP), pp 4069\u20134072","DOI":"10.1109\/ICASSP.2009.4960522"},{"issue":"2","key":"5944_CR15","doi-asserted-by":"publisher","first-page":"371","DOI":"10.1109\/TASL.2011.2158419","volume":"20","author":"G Friedland","year":"2012","unstructured":"Friedland G, Janin A, Imseng D, Anguera X, Gottlieb L, Huijbregts M, Knox MT, Vinyals O (2012) The ICSI RT-09 speaker diarization system. IEEE Trans Audio Speech Lang Process 20(2):371\u2013381","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"5944_CR16","doi-asserted-by":"crossref","unstructured":"Fujimoto M, Ishizuka K, Nakatani T (2009) A study of mutual front-end processing method based on statistical model for noise robust speech recognition. In: 10Th annual conference of the international speech communication association (INTERSPEECH), pp 1235\u20131238","DOI":"10.21437\/Interspeech.2009-356"},{"issue":"5","key":"5944_CR17","doi-asserted-by":"publisher","first-page":"1086","DOI":"10.1109\/TPAMI.2017.2648793","volume":"40","author":"Israel D. Gebru","year":"2018","unstructured":"Gebru I, Ba S, Li X, Horaud R (2017) Audio-visual speaker diarization based on spatiotemporal bayesian fusion. IEEE Transactions on Pattern Analysis and Machine Intelligence. https:\/\/doi.org\/10.1109\/TPAMI.2017.2648793","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"5944_CR18","doi-asserted-by":"crossref","unstructured":"Ghaemmaghami H, Baker BJ, Vogt RJ, Sridharan S (2010) Noise robust voice activity detection using features extracted from the time-domain autocorrelation function. In: 11th annual conference of the international speech communication association (INTERSPEECH), pp 3118\u20133121","DOI":"10.21437\/Interspeech.2010-776"},{"issue":"2","key":"5944_CR19","doi-asserted-by":"publisher","first-page":"518","DOI":"10.1109\/TASLP.2013.2295918","volume":"22","author":"S Gonzalez","year":"2014","unstructured":"Gonzalez S, Brookes M (2014) PEFAC - a pitch estimation algorithm robust to high levels of noise. IEEE\/ACM Trans Audio Speech Lang Process 22(2):518\u2013530","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"issue":"2","key":"5944_CR20","doi-asserted-by":"publisher","first-page":"499","DOI":"10.1109\/TASL.2011.2164527","volume":"20","author":"T Hori","year":"2012","unstructured":"Hori T, Araki S, Yoshioka T, Fujimoto M, Watanabe S, Oba T, Ogawa A, Otsuka K, Mikami D, Kinoshita K, Nakatani T, Nakamura A, Yamato J (2012) Low-latency real-time meeting recognition and understanding using distant microphones and omni-directional camera. IEEE Trans Audio Speech Lang Process 20(2):499\u2013513","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"5944_CR21","unstructured":"Hung H, Friedland G (2008) Towards audio-visual on-line diarization of participants in group meetings. In: Workshop on multi-camera and multi-modal sensor fusion algorithms and applications"},{"key":"5944_CR22","unstructured":"Liu Q, Wang W, Jackson P (2011) A visual voice activity detection method with adaboosting. In: Sensor signal processing for defence (SSPD), pp 1\u20135"},{"key":"5944_CR23","unstructured":"Liu Y, Nie L, Han L, Zhang L, Rosenblum DS (2015) Action2activity: Recognizing complex activities from sensor data. In: International joint conference on artificial intelligence (IJCAI), pp 1617\u20131623"},{"key":"5944_CR24","doi-asserted-by":"crossref","unstructured":"Liu Y, Zhang L, Nie L, Yan Y, Rosenblum DS (2016) Fortune teller: Predicting your career path. In: Proceedings of the AAAI conference on artificial intelligence, pp 201\u2013207","DOI":"10.1609\/aaai.v30i1.9969"},{"key":"5944_CR25","unstructured":"Liu Y, Zheng Y, Liang Y, Liu S, Rosenblum DS (2016) Urban water quality prediction based on multi-task multi-view learning. In: International joint conference on artificial intelligence (IJCAI)"},{"key":"5944_CR26","doi-asserted-by":"crossref","unstructured":"Marti A, Cobos M, Lopez JJ (2011) Real time speaker localization and detection system for camera steering in multiparticipant videoconferencing environments. In: IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 2592\u20132595","DOI":"10.1109\/ICASSP.2011.5947015"},{"key":"5944_CR27","unstructured":"McCowan I, Carletta J, Kraaij W, Ashby S, Bourban S, Flynn M, Guillemot M, Hain T, Kadlec J, Karaiskos V, Kronenthal M, Lathoud G, Lincoln M, Lisowska A, Post W, Reidsma D, Wellner P (2005) The AMI meeting corpus. In: 5th international conference on methods and techniques in behavioral research, pp 137\u2013140"},{"issue":"1","key":"5944_CR28","doi-asserted-by":"publisher","first-page":"147","DOI":"10.1109\/JSTSP.2012.2237379","volume":"7","author":"VP Minotto","year":"2013","unstructured":"Minotto VP, Lopes CBO, Scharcanski J, Jung CR, Lee B (2013) Audiovisual voice activity detection based on microphone arrays and color information. IEEE Journal of Selected Topics in Signal Processing 7(1):147\u2013156","journal-title":"IEEE Journal of Selected Topics in Signal Processing"},{"issue":"4","key":"5944_CR29","doi-asserted-by":"publisher","first-page":"1032","DOI":"10.1109\/TMM.2014.2305632","volume":"16","author":"VP Minotto","year":"2014","unstructured":"Minotto VP, Jung CR, Lee B (2014) Simultaneous-speaker voice activity detection and localization using mid-fusion of svm and hmms. IEEE Trans Multimedia 16(4):1032\u20131044","journal-title":"IEEE Trans Multimedia"},{"issue":"10","key":"5944_CR30","doi-asserted-by":"publisher","first-page":"1694","DOI":"10.1109\/TMM.2015.2463722","volume":"17","author":"VP Minotto","year":"2015","unstructured":"Minotto VP, Jung CR, Lee B (2015) Multimodal multi-channel on-line speaker diarization using sensor fusion through SVM. IEEE Trans Multimedia 17(10):1694\u20131705","journal-title":"IEEE Trans Multimedia"},{"issue":"1","key":"5944_CR31","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1109\/TPAMI.2011.47","volume":"34","author":"A Noulas","year":"2012","unstructured":"Noulas A, Englebienne G, Krose BJ (2012) Multimodal speaker diarization. IEEE Trans Pattern Anal Mach Intell 34(1):79\u201393","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"4","key":"5944_CR32","doi-asserted-by":"publisher","first-page":"322","DOI":"10.4304\/jmm.5.4.322-331","volume":"5","author":"V Rozgic","year":"2010","unstructured":"Rozgic V, Han KJ, Georgiou PG, Narayanan S (2010) Multimodal speaker segmentation and identification in presence of overlapped speech segments. Journal of Multimedia 5(4):322\u2013331","journal-title":"Journal of Multimedia"},{"issue":"1","key":"5944_CR33","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1007\/s11042-014-2274-x","volume":"75","author":"N Sarafianos","year":"2016","unstructured":"Sarafianos N, Giannakopoulos T, Petridis S (2016) Audio-visual speaker diarization using fisher linear semi-discriminant analysis. Multimed Tools Appl 75(1):115\u2013130","journal-title":"Multimed Tools Appl"},{"key":"5944_CR34","doi-asserted-by":"crossref","unstructured":"Schmalenstroeer J, Kelling M, Leutnant V, Haeb-Umbach R (2009) Fusing audio and video information for online speaker diarization. In: 10th annual conference of the international speech communication association (INTERSPEECH), pp 1163\u20131166","DOI":"10.21437\/Interspeech.2009-338"},{"key":"5944_CR35","doi-asserted-by":"crossref","unstructured":"Scott D, Jung CR, Bins J, Said A, Kalker A (2009) Video based VAD using adaptive color information. In: 11Th IEEE international symposium on multimedia, pp 80\u201387","DOI":"10.1109\/ISM.2009.41"},{"key":"5944_CR36","doi-asserted-by":"crossref","unstructured":"Soldi G, Beaugeant C, Evans N (2015) Adaptive and online speaker diarization for meeting data. In: 23Rd european signal processing conference (EUSIPCO), pp 2112\u20132116","DOI":"10.1109\/EUSIPCO.2015.7362757"},{"issue":"2","key":"5944_CR37","doi-asserted-by":"publisher","first-page":"783","DOI":"10.1016\/j.patcog.2011.07.011","volume":"45","author":"P Tiawongsombat","year":"2012","unstructured":"Tiawongsombat P, Jeong MH, Yun JS, You BJ, Oh SR (2012) Robust visual speakingness detection using bi-level HMM. Pattern Recogn 45(2):783\u2013793","journal-title":"Pattern Recogn"},{"key":"5944_CR38","doi-asserted-by":"crossref","unstructured":"Vaquero C, Vinyals O, Friedland G (2010) A hybrid approach to online speaker diarization. In: 11Th annual conference of the international speech communication association (INTERSPEECH), pp 2638\u20132641","DOI":"10.21437\/Interspeech.2010-700"},{"key":"5944_CR39","doi-asserted-by":"crossref","unstructured":"Viola P, Jones M (2001) Rapid object detection using a boosted cascade of simple features. In: Computer vision and pattern recognition (CVPR), vol 1, pp 511\u2013518","DOI":"10.1109\/CVPR.2001.990517"},{"key":"5944_CR40","unstructured":"Wellner P, Flynn M, Guillemot M (2004) Browsing recorded meetings with Ferret. In: International workshop on machine learning for multimodal interaction. Springer, pp 12\u201321"},{"key":"5944_CR41","doi-asserted-by":"crossref","unstructured":"Wooters C, Huijbregts M (2008) The ICSI RT07s speaker diarization system. In: Multimodal technologies for perception of humans: International evaluation workshops CLEAR 2007 and RT 2007. Springer, pp 509\u2013519","DOI":"10.1007\/978-3-540-68585-2_47"},{"key":"5944_CR42","doi-asserted-by":"crossref","unstructured":"Zhang C, Yin P, Rui Y, Cutler R, Viola P (2006) Boosting-based multimodal speaker detection for distributed meetings. In: IEEE 8Th workshop on multimedia signal processing (MMSP), pp 86\u201391","DOI":"10.1109\/MMSP.2006.285274"},{"key":"5944_CR43","doi-asserted-by":"crossref","unstructured":"Zhang C, Zhang Z, Florencio D (2007) Maximum likelihood sound source localization for multiple directional microphones. In: IEEE international conference on acoustics, speech and signal processing (ICASSP), vol 1, pp 125\u2013128","DOI":"10.1109\/ICASSP.2007.366632"},{"issue":"3","key":"5944_CR44","doi-asserted-by":"publisher","first-page":"538","DOI":"10.1109\/TMM.2008.917406","volume":"10","author":"C Zhang","year":"2008","unstructured":"Zhang C, Florencio D, Ba DE, Zhang Z (2008) Maximum likelihood sound source localization and beamforming for directional microphone arrays in distributed meetings. IEEE Trans Multimedia 10(3):538\u2013548","journal-title":"IEEE Trans Multimedia"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-018-5944-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-018-5944-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-018-5944-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,18]],"date-time":"2022-08-18T22:37:00Z","timestamp":1660862220000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-018-5944-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,4,11]]},"references-count":44,"journal-issue":{"issue":"20","published-print":{"date-parts":[[2018,10]]}},"alternative-id":["5944"],"URL":"https:\/\/doi.org\/10.1007\/s11042-018-5944-2","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2018,4,11]]},"assertion":[{"value":"24 July 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 January 2018","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 March 2018","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 April 2018","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}