{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T13:19:57Z","timestamp":1780406397697,"version":"3.54.1"},"reference-count":158,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2010,4,4]],"date-time":"2010-04-04T00:00:00Z","timestamp":1270339200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2010,11]]},"DOI":"10.1007\/s00530-010-0182-0","type":"journal-article","created":{"date-parts":[[2010,4,3]],"date-time":"2010-04-03T05:58:07Z","timestamp":1270274287000},"page":"345-379","source":"Crossref","is-referenced-by-count":955,"title":["Multimodal fusion for multimedia analysis: a survey"],"prefix":"10.1007","volume":"16","author":[{"given":"Pradeep K.","family":"Atrey","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"M. Anwar","family":"Hossain","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Abdulmotaleb","family":"El Saddik","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mohan S.","family":"Kankanhalli","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2010,4,4]]},"reference":[{"key":"182_CR1","unstructured":"PETS: Performance evaluation of tracking and surveillance (Last access date 31 August 2009). http:\/\/www.cvg.rdg.ac.uk\/slides\/pets.html"},{"key":"182_CR2","unstructured":"TRECVID data availability (Last access date 02 September 2009). http:\/\/www-nlpir.nist.gov\/projects\/trecvid\/trecvid.data.html"},{"issue":"2","key":"182_CR3","first-page":"170","volume":"2003","author":"W. Adams","year":"2003","unstructured":"Adams, W., Iyengar, G., Lin, C., Naphade, M., Neti, C., Nock, H., Smith, J.: Semantic indexing of multimedia content using visual, audio, and text cues. EURASIP J. Appl. Signal Process. 2003(2), 170\u2013185 (2003)","journal-title":"EURASIP J. Appl. Signal Process."},{"key":"182_CR4","doi-asserted-by":"crossref","unstructured":"Aguilar, J.F., Garcia, J.O., Romero, D.G., Rodriguez, J.G.: A comparative evaluation of fusion strategies for multimodal biometric verification. In: International Conference on Video-Based Biometrie Person Authentication, pp. 830\u2013837. Guildford (2003)","DOI":"10.1007\/3-540-44887-X_96"},{"issue":"11","key":"182_CR5","doi-asserted-by":"crossref","first-page":"2025","DOI":"10.1109\/JPROC.2006.886017","volume":"94","author":"P.S. Aleksic","year":"2006","unstructured":"Aleksic, P.S., Katsaggelos, A.K.: Audio-visual biometrics. Proc. IEEE 94(11), 2025\u20132044 (2006)","journal-title":"Proc. IEEE"},{"key":"182_CR6","doi-asserted-by":"crossref","unstructured":"Andrieu, C., Doucet, A., Singh, S., Tadic, V.: Particle methods for change detection, system identification, and control. Proc. IEEE 92(3), 423\u2013438 (2004)","DOI":"10.1109\/JPROC.2003.823142"},{"key":"182_CR7","doi-asserted-by":"crossref","unstructured":"Argillander, J., Iyengar, G., Nock, H.: Semantic annotation of multimedia using maximum entropy models. In: International Conference on Accoustic, Speech and Signal Processing, pp. II\u2013153\u2013156. Philadelphia (2005)","DOI":"10.1109\/ICASSP.2005.1415364"},{"issue":"3","key":"182_CR8","doi-asserted-by":"crossref","first-page":"239","DOI":"10.1007\/s00530-006-0063-8","volume":"12","author":"P.K. Atrey","year":"2006","unstructured":"Atrey, P.K., Kankanhalli, M.S., Jain, R.: Information assimilation framework for event detection in multimedia surveillance systems. Springer\/ACM Multimed. Syst. J. 12(3), 239\u2013253 (2006)","journal-title":"Springer\/ACM Multimedia Syst. J."},{"key":"182_CR9","doi-asserted-by":"crossref","unstructured":"Atrey, P.K., Kankanhalli, M.S., Oommen, J.B.: Goal-oriented optimal subset selection of correlated multimedia streams. ACM Trans. Multimedia Comput. Commun. Appl. 3(1), 2 (2007)","DOI":"10.1145\/1198302.1198304"},{"key":"182_CR10","doi-asserted-by":"crossref","unstructured":"Atrey, P.K., Kankanhalli, M.S., El Saddik, A.: Confidence building among correlated streams in multimedia surveillance systems. In: International Conference on Multimedia Modeling, pp. 155\u2013164. Singapore (2007)","DOI":"10.1007\/978-3-540-69429-8_16"},{"key":"182_CR11","doi-asserted-by":"crossref","unstructured":"Ayache, S., Qu\u00e9not, G., Gensel, J.: Classifier fusion for svm-based multimedia semantic indexing. In: The 29th European Conference on Information Retrieval Research, pp. 494\u2013504. Rome (2007)","DOI":"10.1007\/978-3-540-71496-5_44"},{"key":"182_CR12","doi-asserted-by":"crossref","first-page":"68","DOI":"10.1109\/6046.985555","volume":"4","author":"N. Babaguchi","year":"2002","unstructured":"Babaguchi, N., Kawai, Y., Kitahashi, T.: Event based indexing of broadcasted sports video by intermodal collaboration. IEEE Trans. Multimed. 4, 68\u201375 (2002)","journal-title":"IEEE Trans. Multimed."},{"issue":"4","key":"182_CR13","doi-asserted-by":"crossref","first-page":"575","DOI":"10.1109\/TMM.2004.830811","volume":"6","author":"N. Babaguchi","year":"2004","unstructured":"Babaguchi, N., Kawai, Y., Ogura, T., Kitahashi, T.: Personalized abstraction of broadcasted american football video by highlight selection. IEEE Trans. Multimed. 6(4), 575\u2013586 (2004)","journal-title":"IEEE Trans. Multimed."},{"key":"182_CR14","doi-asserted-by":"crossref","unstructured":"Bailly-Bailli\u00e9re, E., Bengio, S., Bimbot, F., Hamouz, M., Kittler, J., Mari\u00e9thoz, J., Matas, J., Messer, K., Popovici, V., Por\u00e9e, F., Ru\u00edz, B., Thiran, J.P.: The BANCA database and evaluation protocol. In: International Conference on Audio-and Video-Based Biometrie Person Authentication, pp. 625\u2013638. Guildford (2003)","DOI":"10.1007\/3-540-44887-X_74"},{"key":"182_CR15","doi-asserted-by":"crossref","first-page":"828","DOI":"10.1109\/TPAMI.2003.1206512","volume":"25","author":"M.J. Beal","year":"2003","unstructured":"Beal, M.J., Jojic, N., Attias, H.: A graphical model for audio-visual object tracking. IEEE Trans. Pattern Anal. Mach. Intell. 25, 828\u2013 836 (2003)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"8","key":"182_CR16","doi-asserted-by":"crossref","first-page":"1789","DOI":"10.1109\/36.942557","volume":"39","author":"A. Bendjebbour","year":"2001","unstructured":"Bendjebbour, A., Delignon, Y., Fouque, L., Samson, V., Pieczynski, W.: Multisensor image segmentation using Dempster\u2013Shafer fusion in markov fields context. IEEE Trans. Geosci. Remote Sens. 39(8), 1789\u20131798 (2001)","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"182_CR17","doi-asserted-by":"crossref","unstructured":"Bengio, S.: Multimodal authentication using asynchronous hmms. In: The 4th International Conference Audio and Video Based Biometric Person Authentication, pp. 770\u2013777. Guildford (2003)","DOI":"10.1007\/3-540-44887-X_89"},{"issue":"4","key":"182_CR18","doi-asserted-by":"crossref","first-page":"267","DOI":"10.1016\/S1566-2535(02)00089-1","volume":"3","author":"S. Bengio","year":"2002","unstructured":"Bengio, S., Marcel, C., Marcel, S., Mariethoz, J. Confidence measures for multimodal identity verification. Inf. Fusion 3(4), 267\u2013276 (2002)","journal-title":"Inf. Fusion"},{"key":"182_CR19","doi-asserted-by":"crossref","unstructured":"Bredin, H., Chollet, G.: Audio-visual speech synchrony measure for talking-face identity verification. In: IEEE International Conference on Acoustics, Speech and Signal Processing, vol. 2, pp. 233\u2013236. Paris (2007)","DOI":"10.1109\/ICASSP.2007.366215"},{"key":"182_CR20","doi-asserted-by":"crossref","unstructured":"Bredin, H., Chollet, G.: Audiovisual speech synchrony measure: application to biometrics. EURASIP J. Adv. Signal Process. 11\u00a0p. (2007). Article ID 70186","DOI":"10.1155\/2007\/70186"},{"key":"182_CR21","unstructured":"Br\u00e9mond, F., Thonnat, M.: A context representation of surveillance systems. In: European Conference on Computer Vision. Orlando (1996)"},{"key":"182_CR22","volume-title":"Multi-sensor Fusion: Fundamentals and Applications with Software","author":"R.R. Brooks","year":"1998","unstructured":"Brooks, R.R., Iyengar, S.S.: Multi-sensor Fusion: Fundamentals and Applications with Software. Prentice Hall PTR, Upper Saddle River, NJ (1998)"},{"issue":"2","key":"182_CR23","doi-asserted-by":"crossref","first-page":"121","DOI":"10.1023\/A:1009715923555","volume":"2","author":"C.J.C. Burges","year":"1998","unstructured":"Burges, C.J.C.: A tutorial on support vector machines for pattern recognition. Data Min. Knowl. Discov. 2(2), 121\u2013167 (1998)","journal-title":"Data Min. Knowl. Discov."},{"key":"182_CR24","doi-asserted-by":"crossref","unstructured":"Caruana, R., Munson, A., Niculescu-Mizil, A.: Getting the most out of ensemble selection. In: ACM International Conference on on Data Mining, pp. 828\u2013833. Maryland (2006)","DOI":"10.1109\/ICDM.2006.76"},{"key":"182_CR25","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1023\/A:1023622605600","volume":"6","author":"L. Chaisorn","year":"2003","unstructured":"Chaisorn, L., Chua, T.S., Lee, C.H., Zhao, Y., Xu, H., Feng, H., Tian, Q.: A multi-modal approach to story segmentation for news video. World Wide Web 6, 187\u2013208 (2003)","journal-title":"World Wide Web"},{"key":"182_CR26","doi-asserted-by":"crossref","unstructured":"Chang, S.F., Manmatha, R., Chua, T.S.: Combining text and audio-visual features in video indexing. In: IEEE International Conference on Acoustics, Speech, and Signal Processing, vol. 5, pp. 1005\u20131008. IEEE Computer Society, Philadelphia (2005)","DOI":"10.1109\/ICASSP.2005.1416476"},{"key":"182_CR27","doi-asserted-by":"crossref","unstructured":"Chen, Q., Aickelin, U.: Anomaly detection using the dempster\u2013shafer method. In: International Conference on Data Mining, pp. 232\u2013240. Las Vegas (2006)","DOI":"10.2139\/ssrn.2831339"},{"key":"182_CR28","unstructured":"Chetty, G., Wagner, M.: Audio-visual multimodal fusion for biometric person authentication and liveness verification. In: NICTA-HCSNet Multimodal User Interaction Workshop, pp. 17\u201324. Sydney (2006)"},{"key":"182_CR29","doi-asserted-by":"crossref","unstructured":"Chieu, H.L., Lee, Y.K.: Query based event extraction along a timeline. In: International ACM Conference on Research and Development in Information Retrieval, pp. 425\u2013432. Sheffield (2004)","DOI":"10.1145\/1008992.1009065"},{"key":"182_CR30","doi-asserted-by":"crossref","unstructured":"Choudhury, T., Rehg, J.M., Pavlovic, V., Pentland, A.: Boosting and structure learning in dynamic bayesian networks for audio-visual speaker detection. In: The 16th International Conference on Pattern Recognition, vol. 3, pp. 789\u2013794. Quebec (2002)","DOI":"10.1109\/ICPR.2002.1048137"},{"key":"182_CR31","doi-asserted-by":"crossref","unstructured":"Chua, T.S., Chang, S.F., Chaisorn, L., Hsu, W.: Story boundary detection in large broadcast news video archives: techniques, experience and trends. In: ACM International Conference on Multimedia, pp. 656\u2013659. New York, USA (2004)","DOI":"10.1145\/1027527.1027679"},{"key":"182_CR32","unstructured":"Corradini, A., Mehta, M., Bernsen, N., Martin, J., Abrilian, S.: Multimodal input fusion in human\u2013computer interaction. In: NATO-ASI Conference on Data Fusion for Situation Monitoring, Incident Detection, Alert and Response Management. Karlsruhe University, Germany (2003)"},{"issue":"3","key":"182_CR33","doi-asserted-by":"crossref","first-page":"736","DOI":"10.1109\/78.984773","volume":"50","author":"D. Crisan","year":"2002","unstructured":"Crisan, D., Doucet, A.: A survey of convergence results on particle filtering methods for practitioners. IEEE Trans. Signal Process. 50(3), 736\u2013746 (2002)","journal-title":"IEEE Trans. Signal Process."},{"key":"182_CR34","doi-asserted-by":"crossref","unstructured":"Cutler, R., Davis, L.: Look who\u2019s talking: Speaker detection using video and audio correlation. In: IEEE International Conference on Multimedia and Expo, pp. 1589\u20131592. New York City (2000)","DOI":"10.1109\/ICME.2000.871073"},{"key":"182_CR35","doi-asserted-by":"crossref","unstructured":"Darrell, T., Fisher III, J.W., Viola, P., Freeman, W.: Audio-visual segmentation and \u201cthe cocktail party effect\u201d. In: International Conference on Multimodal Interfaces. Bejing (2000)","DOI":"10.1007\/3-540-40063-X_5"},{"key":"182_CR36","doi-asserted-by":"crossref","unstructured":"Datcu, D., Rothkrantz, L.J.M.: Facial expression recognition with relevance vector machines. In: IEEE International Conference on Multimedia and Expo, pp. 193\u2013196. Amsterdam, The Netherlands (2005)","DOI":"10.1109\/ICME.2005.1521393"},{"key":"182_CR37","doi-asserted-by":"crossref","first-page":"417","DOI":"10.1023\/A:1019770124060","volume":"12","author":"R. Debouk","year":"2002","unstructured":"Debouk, R., Lafortune, S., Teneketzis, D.: On an optimal problem in sensor selection. J. Discret. Event Dyn. Syst. Theory Appl. 12, 417\u2013445 (2002)","journal-title":"J. Discret. Event Dyn. Syst. Theory Appl."},{"key":"182_CR38","doi-asserted-by":"crossref","unstructured":"Ding, Y., Fan, G.: Segmental hidden markov models for view-based sport video analysis. In: International Workshop on Semantic Learning Applications in Multimedia. Minneapolis (2007)","DOI":"10.1109\/CVPR.2007.383494"},{"key":"182_CR39","unstructured":"Fisher-III, J., Darrell, T., Freeman, W., Viola, P.: Learning joint statistical models for audio-visual fusion and segregation. In: Advances in Neural Information Processing Systems, pp. 772\u2013778. Denver (2000)"},{"key":"182_CR40","doi-asserted-by":"crossref","unstructured":"Foresti, G.L., Snidaro, L.: A distributed sensor network for video surveillance of outdoor environments. In: IEEE International Conference on Image Processing. Rochester (2002)","DOI":"10.1109\/ICIP.2002.1038076"},{"key":"182_CR41","doi-asserted-by":"crossref","unstructured":"Gandetto, M., Marchesotti, L., Sciutto, S., Negroni, D., Regazzoni, C.S.: From multi-sensor surveillance towards smart interactive spaces. In: IEEE International Conference on Multimedia and Expo, pp. I:641\u2013644. Baltimore (2003)","DOI":"10.1109\/ICME.2003.1220999"},{"key":"182_CR42","doi-asserted-by":"crossref","unstructured":"Garcia Salicetti, S., Beumier, C., Chollet, G., Dorizzi, B., les Jardins, J., Lunter, J., Ni, Y., Petrovska Delacretaz, D.: BIOMET: A multimodal person authentication database including face, voice, fingerprint, hand and signature modalities. In: International Conference on Audio-and Video-Based Biometrie Person Authentication, pp. 845\u2013853. Guildford, UK (2003)","DOI":"10.1007\/3-540-44887-X_98"},{"key":"182_CR43","doi-asserted-by":"crossref","unstructured":"Gehrig, T., Nickel, K., Ekenel, H., Klee, U., McDonough, J.: Kalman filters for audio\u2013video source localization. In: IEEE Workshop on Applications of Signal Processing to Audio and Acoustics, pp. 118\u2013 121. Karlsruhe University, Germany (2005)","DOI":"10.1109\/ASPAA.2005.1540183"},{"key":"182_CR44","unstructured":"Guironnet, M., Pellerin, D., Rombaut, M.: Video classification based on low-level feature fusion model. In: The 13th European Signal Processing Conference. Antalya, Turkey (2005)"},{"key":"182_CR45","doi-asserted-by":"crossref","unstructured":"Hall, D.L., Llinas, J.: An introduction to multisensor fusion. In: Proceedings of the IEEE: Special Issues on Data Fusion, vol. 85, no. 1, pp. 6\u201323 (1997)","DOI":"10.1109\/5.554205"},{"key":"182_CR46","doi-asserted-by":"crossref","unstructured":"Hershey, J., Attias, H., Jojic, N., Krisjianson, T.: Audio visual graphical models for speech processing. In: IEEE International Conference on Speech, Acoustics, and Signal Processing, pp. 649\u2013652. Montreal (2004)","DOI":"10.1109\/ICASSP.2004.1327194"},{"key":"182_CR47","unstructured":"Hershey, J., Movellan, J.: Audio-vision: using audio-visual synchrony to locate sounds. In: Advances in Neural Information Processing Systems, pp. 813\u2013819. MIT Press, USA (2000)"},{"issue":"7","key":"182_CR48","doi-asserted-by":"crossref","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"G.E. Hinton","year":"2006","unstructured":"Hinton, G.E., Osindero, S., Teh, Y.: A fast learning algorithm for deep belief nets. Neural Comput. 18(7), 1527\u20131554 (2006)","journal-title":"Neural Comput."},{"key":"182_CR49","doi-asserted-by":"crossref","unstructured":"Holzapfel, H., Nickel, K., Stiefelhagen, R.: Implementation and evaluation of a constraint-based multimodal fusion system for speech and 3d pointing gestures. In: ACM International Conference on Multimodal Interfaces, pp. 175\u2013182. State College, PA (2004)","DOI":"10.1145\/1027933.1027964"},{"key":"182_CR50","doi-asserted-by":"crossref","unstructured":"Hossain, M.A., Atrey, P.K., El Saddik, A.: Smart mirror for ambient home environment. In: The 3rd IET International Conference on Intelligent Environments, pp. 589\u2013596. Ulm (2007)","DOI":"10.1049\/cp:20070431"},{"key":"182_CR51","doi-asserted-by":"crossref","unstructured":"Hossain, M.A., Atrey, P.K., El Saddik, A.: Modeling and assessing quality of information in multi-sensor multimedia monitoring systems. ACM Trans. Multimed. Comput. Commun. Appl. 7(1) (2011)","DOI":"10.1145\/1870121.1870124"},{"key":"182_CR52","unstructured":"Hsu, W., Kennedy, L., Huang, C.W., Chang, S.F., Lin, C.Y.: News video story segmentation using fusion of multi-level multi-modal features in TRECVID 2003. In: International Conference on Acoustics Speech and Signal Processing. Montreal, QC (2004)"},{"key":"182_CR53","doi-asserted-by":"crossref","unstructured":"Hsu, W.H.M., Chang, S.F.: Generative, discriminative, and ensemble learning on multi-modal perceputal fusion toward news stroy segmentation. In: IEEE International Conference on Multimedia and Expos, pp. 1091\u20131094. Taipei (2004)","DOI":"10.1109\/ICME.2004.1394400"},{"key":"182_CR54","unstructured":"Hu, H., Gan, J.Q.: Sensors and data fusion algorithms in mobile robotics. Technical report, CSM-422, Department of Computer Science, University of Essex, UK (2005)"},{"key":"182_CR55","doi-asserted-by":"crossref","unstructured":"Hua, X.S., Zhang, H.J.: An attention-based decision fusion scheme for multimedia information retrieval. In: The 5th Pacific-Rim Conference on Multimedia. Tokyo, Japan (2004)","DOI":"10.1007\/978-3-540-30542-2_123"},{"key":"182_CR56","doi-asserted-by":"crossref","unstructured":"Isler, V., Bajcsy, R.: The sensor selection problem for bounded uncertainty sensing models. In: International Symposium on Information Processing in Sensor Networks, pp. 151\u2013158. Los Angeles (2005)","DOI":"10.1109\/IPSN.2005.1440917"},{"key":"182_CR57","doi-asserted-by":"crossref","unstructured":"Iyengar, G., Nock, H.J., Neti, C.: Audio-visual synchrony for detection of monologue in video archives. In: IEEE International Conference on Acoustics, Speech, and Signal Processing. Hong Kong (2003)","DOI":"10.1109\/ICME.2003.1220921"},{"key":"182_CR58","doi-asserted-by":"crossref","unstructured":"Iyengar, G., Nock, H.J., Neti, C.: Discriminative model fusion for semantic concept detection and annotation in video. In: ACM International Conference on Multimedia, pp. 255\u2013258. Berkeley (2003)","DOI":"10.1145\/957013.957065"},{"key":"182_CR59","unstructured":"Jaffre, G., Pinquier, J.: Audio\/video fusion: a preprocessing step for multimodal person identification. In: International Workshop on MultiModal User Authentification. Toulouse, France (2006)"},{"key":"182_CR60","doi-asserted-by":"crossref","unstructured":"Jaimes, A., Sebe, N.: Multimodal human computer interaction: a survey. In: IEEE International Workshop on Human Computer Interaction. Beijing (2005)","DOI":"10.1007\/11573425_1"},{"issue":"12","key":"182_CR61","doi-asserted-by":"crossref","first-page":"2270","DOI":"10.1016\/j.patcog.2005.01.012","volume":"38","author":"A. Jain","year":"2005","unstructured":"Jain, A., Nandakumar, K., Ross, A.: Score normalization in multimodal biometric systems. Pattern Recognit. 38(12), 2270\u20132285 (2005)","journal-title":"Pattern Recognit."},{"key":"182_CR62","doi-asserted-by":"crossref","unstructured":"Jasinschi, R.S., Dimitrova, N., McGee, T., Agnihotri, L., Zimmerman, J., Li, D., Louie, J.: A probabilistic layered framework for integrating multimedia content and context information. In: International Conference on Acoustics, Speech and Signal Processing, vol. II, pp. 2057\u20132060. Orlando (2002)","DOI":"10.1109\/ICASSP.2002.1006178"},{"key":"182_CR63","doi-asserted-by":"crossref","unstructured":"Jeon, J., Manmatha, R.: Using maximum entropy for automatic image annotation. In: International Conference on Image and Video Retrieval, vol. 3115, pp. 24\u201332. Dublin (2004)","DOI":"10.1007\/978-3-540-27814-6_7"},{"key":"182_CR64","doi-asserted-by":"crossref","first-page":"369","DOI":"10.1109\/TAC.2003.809144","volume":"48","author":"S. Jiang","year":"2003","unstructured":"Jiang, S., Kumar, R., Garcia, H.E.: Optimal sensor selection for discrete event systems with partial observation. IEEE Trans. Automat. Contr. 48, 369\u2013381 (2003)","journal-title":"IEEE Trans. Automat. Contr."},{"key":"182_CR65","doi-asserted-by":"crossref","unstructured":"Julier, S.J., Uhlmann, J.K.: New extension of the Kalman filter to nonlinear systems. In: Signal Processing, Sensor Fusion, and Target Recognition VI, vol. 3068 SPIE, pp. 182\u2013193. San Diego (1997)","DOI":"10.1117\/12.280797"},{"issue":"Series D","key":"182_CR66","doi-asserted-by":"crossref","first-page":"35","DOI":"10.1115\/1.3662552","volume":"82","author":"R.E. Kalman","year":"1960","unstructured":"Kalman, R.E.: A new approach to linear filtering and prediction problems. Trans. ASME J. Basic Eng. 82(series D), 35\u201345 (1960)","journal-title":"Trans. ASME J. Basic Eng."},{"issue":"5","key":"182_CR67","doi-asserted-by":"crossref","first-page":"937","DOI":"10.1109\/TMM.2006.879876","volume":"8","author":"M.S. Kankanhalli","year":"2006","unstructured":"Kankanhalli, M.S., Wang, J., Jain, R.: Experiential sampling in multimedia systems. IEEE Trans. Multimed. 8(5), 937\u2013946 (2006)","journal-title":"IEEE Trans. Multimed."},{"issue":"5","key":"182_CR68","doi-asserted-by":"crossref","first-page":"947","DOI":"10.1109\/TMM.2006.879875","volume":"8","author":"M.S. Kankanhalli","year":"2006","unstructured":"Kankanhalli, M.S., Wang, J., Jain, R.: Experiential sampling on multiple data streams. IEEE Trans. Multimed. 8(5), 947\u2013955 (2006)","journal-title":"IEEE Trans. Multimed."},{"issue":"3","key":"182_CR69","doi-asserted-by":"crossref","first-page":"226","DOI":"10.1109\/34.667881","volume":"20","author":"J. Kittler","year":"1998","unstructured":"Kittler, J., Hatef, M., Duin, R.P., Matas, J.: On combining classifiers. IEEE Trans. Pattern Anal. Mach. Intell. 20(3), 226\u2013239 (1998)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"182_CR70","doi-asserted-by":"crossref","unstructured":"Lam, K.Y., Cheng, R., Liang, B.Y., Chau, J.: Sensor node selection for execution of continuous probabilistic queries in wireless sensor networks. In: ACM International Workshop on Video Surveillance and Sensor Networks, pp. 63\u201371. NY, USA (2004)","DOI":"10.1145\/1026799.1026811"},{"issue":"10","key":"182_CR71","doi-asserted-by":"crossref","first-page":"2621","DOI":"10.1016\/j.patcog.2007.02.002","volume":"40","author":"T. Le\u00f3n","year":"2007","unstructured":"Le\u00f3n, T., Zuccarello, P., Ayala, G., de Ves, E., Domingo, J.: Applying logistic regression to relevance feedback in image retrieval systems. Pattern Recognit. 40(10), 2621\u20132632 (2007)","journal-title":"Pattern Recognit."},{"key":"182_CR72","doi-asserted-by":"crossref","unstructured":"Li, D., Dimitrova, N., Li, M., Sethi, I.K.: Multimedia content processing through cross-modal association. In: ACM International Conference on Multimedia (2003)","DOI":"10.1145\/957013.957143"},{"key":"182_CR73","doi-asserted-by":"crossref","unstructured":"Li, F.F., Perona, P.: A bayesian hierarchical model for learning natural scene categories. In: IEEE Computer Society Conference on Computer Vision and Pattern Recognition, vol. 2, pp. 524\u2013531. Washington (2005)","DOI":"10.1109\/CVPR.2005.16"},{"key":"182_CR74","doi-asserted-by":"crossref","unstructured":"Li, M., Li, D., Dimitrove, N., Sethi, I.K.: Audio-visual talking face detection. In: International Conference on Multimedia and Expo, pp. 473\u2013476. Baltimore, MD (2003)","DOI":"10.1109\/ICME.2003.1221656"},{"issue":"6","key":"182_CR75","doi-asserted-by":"crossref","first-page":"887","DOI":"10.1016\/j.patcog.2004.11.008","volume":"38","author":"X. Liu","year":"2005","unstructured":"Liu, X., Zhang, L., Li, M., Zhang, H., Wang, D.: Boosting image classification with lda-based feature combination for digital photograph management. Pattern Recognit. 38(6), 887\u2013901 (2005)","journal-title":"Pattern Recognit."},{"key":"182_CR76","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zhang, D., Lu, G., Tan, A.H.: Integrating semantic templates with decision tree for image semantic learning. In: The 13th International Multimedia Modeling Conference, pp. 185\u2013195. Singapore (2007)","DOI":"10.1007\/978-3-540-69429-8_19"},{"key":"182_CR77","doi-asserted-by":"crossref","unstructured":"Loh, A., Guan, F., Ge, S.S.: Motion estimation using audio and video fusion. In: International Conference on Control, Automation, Robotics and Vision, vol. 3, pp. 1569\u20131574 (2004)","DOI":"10.1109\/ICARCV.2004.1469293"},{"key":"182_CR78","doi-asserted-by":"crossref","unstructured":"Lucey, S., Sridharan, S., Chandran, V.: Improved speech recognition using adaptive audio-visual fusion via a stochastic secondary classifier. In: International Symposium on Intelligent Multimedia, Video and Speech Processing, pp. 551\u2013554. Hong Kong (2001)","DOI":"10.1109\/ISIMP.2001.925455"},{"issue":"2","key":"182_CR79","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1109\/JSEN.2002.1000251","volume":"2","author":"R.C. Luo","year":"2002","unstructured":"Luo, R.C., Yih, C.C., Su, K.L.: Multisensor fusion and integration: Approaches, applications, and future research directions. IEEE Sens. J. 2(2), 107\u2013119 (2002)","journal-title":"IEEE Sens. J."},{"key":"182_CR80","doi-asserted-by":"crossref","unstructured":"Magalh\u00e3es, J., R\u00fcger, S.: Information-theoretic semantic multimedia indexing. In: International Conference on Image and Video Retrieval, pp. 619\u2013626. Amsterdam, The Netherlands (2007)","DOI":"10.1145\/1282280.1282368"},{"key":"182_CR81","unstructured":"Makkook, M.A.: A multimodal sensor fusion architecture for audio-visual speech recognition. MS Thesis, University of Waterloo, Canada (2007)"},{"key":"182_CR82","unstructured":"Matas, J., Hamouz, M., Jonsson, K., Kittler, J., Li, Y., Kotropoulos, C., Tefas, A., Pitas, I., Tan, T., Yan, H., Smeraldi, F., Capdevielle, N., Gerstner, W., Abdeljaoued, Y., Bigun, J., Ben-Yacoub, S., Mayoraz, E.: Comparison of face verification results on the XM2VTS database. p. 4858. Los Alamitos, CA, USA (2000)"},{"key":"182_CR83","doi-asserted-by":"crossref","unstructured":"McDonald, K., Smeaton, A.F.: A comparison of score, rank and probability-based fusion methods for video shot retrieval. In: International Conference on Image and Video Retrieval, pp. 61\u201370. Singapore (2005)","DOI":"10.1007\/11526346_10"},{"key":"182_CR84","unstructured":"Mena, J.B., Malpica, J.: Color image segmentation using the dempster\u2013shafer theory of evidence for the fusion of texture. In: International Archives of Photogrammetry, Remote Sensing and Spatial Information Sciences, vol. XXXIV, Part 3\/W8, pp. 139\u2013144. Munich, Germany (2003)"},{"key":"182_CR85","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1016\/j.inffus.2003.07.001","volume":"5","author":"G.F. Meyer","year":"2004","unstructured":"Meyer, G.F., Mulligan, J.B., Wuerger, S.M.: Continuous audio-visual digit recognition using N-best decision fusion. J. Inf. Fusion 5, 91\u2013101 (2004)","journal-title":"J. Inf. Fusion"},{"key":"182_CR86","first-page":"1","volume":"11","author":"A.V. Nefian","year":"2002","unstructured":"Nefian, A.V., Liang, L., Pi, X., Liu, X., Murphye, K.: Dynamic bayesian networks for audio-visual speech recognition. EURASIP J. Appl. Signal Process. 11, 1\u201315 (2002)","journal-title":"EURASIP J. Appl. Signal Process."},{"key":"182_CR87","doi-asserted-by":"crossref","unstructured":"Neti, C., Maison, B., Senior, A., Iyengar, G., Cuetos, P., Basu, S., Verma, A.: Joint processing of audio and visual information for multimedia indexing and human-computer interaction. In: International Conference RIAO. Paris, France (2000)","DOI":"10.21437\/ICSLP.2000-466"},{"key":"182_CR88","unstructured":"Ni, J., , Ma, X., Xu, L., Wang, J.: An image recognition method based on multiple bp neural networks fusion. In: IEEE International Conference on Information Acquisition (2004)"},{"key":"182_CR89","doi-asserted-by":"crossref","unstructured":"Nickel, K., Gehrig, T., Stiefelhagen, R., McDonough, J.: A joint particle filter for audio-visual speaker tracking. In: The 7th International Conference on Multimodal Interfaces, pp. 61\u201368. Torento, Italy (2005)","DOI":"10.1145\/1088463.1088477"},{"key":"182_CR90","doi-asserted-by":"crossref","unstructured":"Nock, H.J., Iyengar, G., Neti, C.: Assessing face and speech consistency for monologue detection in video. In: ACM International Conference on Multimedia. French Riviera, France (2002)","DOI":"10.1145\/641007.641070"},{"key":"182_CR91","doi-asserted-by":"crossref","unstructured":"Nock, H.J., Iyengar, G., Neti, C.: Speaker localisation using audio-visual synchrony: an empirical study. In: International Conference on Image and Video Retrieval. Urbana, USA (2003)","DOI":"10.1007\/3-540-45113-7_48"},{"key":"182_CR92","doi-asserted-by":"crossref","unstructured":"Noulas, A.K., Krose, B.J.A.: Em detection of common origin of multi-modal cues. In: International Conference on Multimodal Interfaces, pp. 201\u2013208. Banff (2006)","DOI":"10.1145\/1180995.1181037"},{"issue":"6","key":"182_CR93","doi-asserted-by":"crossref","first-page":"395","DOI":"10.1049\/ip-vis:20031078","volume":"150","author":"J. Ortega-Garcia","year":"2003","unstructured":"Ortega-Garcia, J., Fierrez-Aguilar, J., Simon, D., Gonzalez, J., Faundez-Zanuy, M., Espinosa, V., Satue, A., Hernaez, I., Igarza, J.J., Vivaracho, C., Escudero, D., Moro, Q.I.: Biometric on the internet MCYT baseline corpus: a bimodal biometric database. IEE Proc. Vis. Image Signal Process. 150(6), 395\u2013401 (2003)","journal-title":"IEE Proc. Vis. Image Signal Process."},{"key":"182_CR94","doi-asserted-by":"crossref","first-page":"307","DOI":"10.1109\/7.272256","volume":"30","author":"Y. Oshman","year":"1994","unstructured":"Oshman, Y.: Optimal sensor selection strategy for discrete-time state estimators. IEEE Trans. Aerosp. Electron. Syst. 30, 307\u2013314 (1994)","journal-title":"IEEE Trans. Aerosp. Electron. Syst."},{"issue":"11","key":"182_CR95","doi-asserted-by":"crossref","first-page":"74","DOI":"10.1145\/319382.319398","volume":"42","author":"S. Oviatt","year":"1999","unstructured":"Oviatt, S.: Ten myths of multimodal interaction. Commun. ACM 42(11), 74\u201381 (1999)","journal-title":"Commun. ACM"},{"issue":"9","key":"182_CR96","doi-asserted-by":"crossref","first-page":"45","DOI":"10.1145\/348941.348979","volume":"43","author":"S. Oviatt","year":"2000","unstructured":"Oviatt, S.: Taming speech recognition errors within a multimodal interface. Commun. ACM 43(9), 45\u201351 (2000)","journal-title":"Commun. ACM"},{"key":"182_CR97","unstructured":"Oviatt, S.L.: Multimodal interfaces. In: Jacko, J., Sears, A. (eds.) The Human\u2013Computer Interaction Handbook: Fundamentals, Evolving Technologies and Emerging Applications. Lawrence Erlbaum Assoc., NJ (2003)"},{"key":"182_CR98","doi-asserted-by":"crossref","unstructured":"Pahalawatta, P., Pappas, T.N., Katsaggelos, A.K.: Optimal sensor selection for video-based target tracking in a wireless sensor network. In: IEEE International Conference on Image Processing, pp. V:3073\u20133076. Singapore (2004)","DOI":"10.1109\/ICIP.2004.1421762"},{"key":"182_CR99","unstructured":"Perez, D.G., Lathoud, G., McCowan, I., Odobez, J.M., Moore, D.: Audio-visual speaker tracking with importance particle filter. In: IEEE International Conference on Image Processing (2003)"},{"key":"182_CR100","doi-asserted-by":"crossref","unstructured":"Pfleger, N.: Context based multimodal fusion. In: ACM International Conference on Multimodal Interfaces, pp. 265\u2013272. State College (2004)","DOI":"10.1145\/1027933.1027977"},{"key":"182_CR101","unstructured":"Pfleger, N.: Fade - an integrated approach to multimodal fusion and discourse processing. In: Dotoral Spotlight at ICMI 2005. Trento, Italy (2005)"},{"key":"182_CR102","doi-asserted-by":"crossref","unstructured":"Pitsikalis, V., Katsamanis, A., Papandreou, G., Maragos, P.: Adaptive multimodal fusion by uncertainty compensation. In: Ninth International Conference on Spoken Language Processing. Pittsburgh (2006)","DOI":"10.21437\/Interspeech.2006-616"},{"key":"182_CR103","doi-asserted-by":"crossref","unstructured":"Poh, N., Bengio, S.: How do correlation and variance of base-experts affect fusion in biometric authentication tasks? IEEE Trans. Signal Process. 53, 4384\u20134396 (2005)","DOI":"10.1109\/TSP.2005.857006"},{"issue":"2","key":"182_CR104","doi-asserted-by":"crossref","first-page":"223","DOI":"10.1016\/j.patcog.2005.06.011","volume":"39","author":"N. Poh","year":"2006","unstructured":"Poh, N., Bengio, S.: Database, protocols and tools for evaluating score-level fusion algorithms in biometric authentication. Pattern Recognit. 39(2), 223\u2013233 (2006) (Part Special Issue: Complexity Reduction)","journal-title":"Pattern Recognit."},{"key":"182_CR105","doi-asserted-by":"crossref","unstructured":"Potamianos, G., Luettin, J., Neti, C.: Hierarchical discriminant features for audio-visual LVSCR. In: IEEE International Conference on Acoustic Speech and Signal Processing, pp. 165\u2013168. Salt Lake City (2001)","DOI":"10.1109\/ICASSP.2001.940793"},{"issue":"9","key":"182_CR106","doi-asserted-by":"crossref","first-page":"1306","DOI":"10.1109\/JPROC.2003.817150","volume":"91","author":"G. Potamianos","year":"2003","unstructured":"Potamianos, G., Neti, C., Gravier, G., Garg, A., Senior, A.: Recent advances in the automatic recognition of audiovisual speech. Proc. IEEE 91(9), 1306\u20131326 (2003)","journal-title":"Proc. IEEE"},{"issue":"5","key":"182_CR107","doi-asserted-by":"crossref","first-page":"520","DOI":"10.1109\/TSA.2004.833004","volume":"12","author":"I. Potamitis","year":"2004","unstructured":"Potamitis, I., Chen, H., Tremoulis, G.: Tracking of multiple moving speakers with multiple microphone arrays. IEEE Trans. Speech Audio Process. 12(5), 520\u2013529 (2004)","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"182_CR108","doi-asserted-by":"crossref","unstructured":"Radova, V., Psutka, J.: An approach to speaker identification using multiple classifiers. In: IEEE International Conference on Acoustics, Speech, and Signal Processing, 2, 1135\u20131138. Munich, Germany (1997)","DOI":"10.1109\/ICASSP.1997.596142"},{"key":"182_CR109","unstructured":"Rashidi, A., Ghassemian, H.: Extended dempster\u2013shafer theory for multi-system\/sensor decision fusion. In: Commission IV Joint Workshop on Challenges in Geospatial Analysis, Integration and Visualization II, pp. 31\u201337. Germany (2003)"},{"key":"182_CR110","unstructured":"Reddy, B.S.: Evidential reasoning for multimodal fusion in human computer interaction (2007). MS Thesis, University of Waterloo, Canada"},{"key":"182_CR111","unstructured":"Ribeiro, M.I.: Kalman and extended Kalman filters: concept, derivation and properties. Technical report., Institute for Systems and Robotics, Lisboa (2004)"},{"issue":"2","key":"182_CR112","doi-asserted-by":"crossref","first-page":"305","DOI":"10.1162\/089976699300016674","volume":"11","author":"S. Roweis","year":"1999","unstructured":"Roweis, S., Ghahramani, Z.: A unifying review of linear gaussian models. Neural Comput. 11(2), 305\u2013345 (1999)","journal-title":"Neural Comput."},{"issue":"5","key":"182_CR113","doi-asserted-by":"crossref","first-page":"449","DOI":"10.1016\/j.dsp.2004.05.001","volume":"14","author":"C. Sanderson","year":"2004","unstructured":"Sanderson, C., Paliwal, K.K.: Identity verification using speech and face information. Digit. Signal Process. 14(5), 449\u2013480 (2004)","journal-title":"Digit. Signal Process."},{"issue":"1","key":"182_CR114","doi-asserted-by":"crossref","first-page":"22","DOI":"10.1109\/93.752960","volume":"6","author":"S. Satoh","year":"1999","unstructured":"Satoh, S., Nakamura, Y., Kanade, T.: Name-It: Naming and detecting faces in news video. IEEE Multimed. 6(1), 22\u201335 (1999)","journal-title":"IEEE Multimed."},{"key":"182_CR115","doi-asserted-by":"crossref","unstructured":"Siegel, M., Wu, H.: Confidence fusion. In: IEEE International Workshop on Robot Sensing, pp. 96\u201399 (2004)","DOI":"10.1109\/ROSE.2004.1317621"},{"issue":"20","key":"182_CR116","doi-asserted-by":"crossref","first-page":"429","DOI":"10.1587\/elex.3.429","volume":"3","author":"R. Singh","year":"2006","unstructured":"Singh, R., Vatsa, M., Noore, A., Singh, S.K.: Dempster\u2013shafer theory based finger print classifier fusion with update rule to minimize training time. IEICE Electron. Express 3(20), 429\u2013435 (2006)","journal-title":"IEICE Electron. Express"},{"key":"182_CR117","unstructured":"Slaney, M., Covell, M.: Facesync: A linear operator for measuring synchronization of video facial images and audio tracks. In: Neural Information Processing Society, vol. 13 (2000)"},{"key":"182_CR118","first-page":"151","volume-title":"Multimedia Content Analysis, Theory and Applications.","author":"A.F. Smeaton","year":"2009","unstructured":"Smeaton, A.F., Over, P., Kraaij, W.: High-level feature detection from video in TRECVid: a 5-year retrospective of achievements. In: Divakaran, A. (ed.) Multimedia Content Analysis, Theory and Applications, pp. 151\u2013174. Springer, Berlin (2009)"},{"key":"182_CR119","doi-asserted-by":"crossref","unstructured":"Snoek, C.G.M., Worring, M.: A review on multimodal video indexing. In: IEEE International Conference on Multimedia and Expo, pp. 21\u201324. Lusanne, Switzerland (2002)","DOI":"10.1109\/ICME.2002.1035364"},{"issue":"1","key":"182_CR120","doi-asserted-by":"crossref","first-page":"5","DOI":"10.1023\/B:MTAP.0000046380.27575.a5","volume":"25","author":"C.G.M. Snoek","year":"2005","unstructured":"Snoek, C.G.M., Worring, M.: Multimodal video indexing: A review of the state-of-the-art. Multimed. Tools Appl. 25(1), 5\u201335 (2005)","journal-title":"Multimed. Tools Appl."},{"key":"182_CR121","doi-asserted-by":"crossref","unstructured":"Snoek, C.G.M., Worring, M., Smeulders, A.W.M.: Early versus late fusion in semantic video analysis. In: ACM International Conference on Multimedia, pp. 399\u2013402. Singapore (2005)","DOI":"10.1145\/1101149.1101236"},{"key":"182_CR122","doi-asserted-by":"crossref","unstructured":"Sridharan, H., Sundaram, H., Rikakis, T.: Computational models for experiences in the arts and multimedia. In: The ACM Workshop on Experiential Telepresence. Berkeley, CA (2003)","DOI":"10.1145\/982484.982490"},{"key":"182_CR123","unstructured":"Stauffer, C.: Automated audio-visual activity analysis. Tech. rep., MIT-CSAIL-TR-2005-057, Massachusetts Institute of Technology, Cambridge, MA (2005)"},{"issue":"1","key":"182_CR124","doi-asserted-by":"crossref","first-page":"22","DOI":"10.1109\/79.911196","volume":"18","author":"N. Strobel","year":"2001","unstructured":"Strobel, N., Spors, S., Rabenstein, R.: Joint audio\u2013video object localization and tracking. IEEE Signal Process. Mag. 18(1), 22\u201331 (2001)","journal-title":"IEEE Signal Process. Mag."},{"key":"182_CR125","doi-asserted-by":"crossref","unstructured":"Talantzis, F., Pnevmatikakis, A., Polymenakos, L.C.: Real time audio-visual person tracking. In: IEEE 8th Workshop on Multimedia Signal Processing, pp. 243\u2013247. IEEE Computer Society, Victoria, BC (2006)","DOI":"10.1109\/MMSP.2006.285306"},{"key":"182_CR126","doi-asserted-by":"crossref","unstructured":"Tatbul, N., Buller, M., Hoyt, R., Mullen, S., Zdonik, S.: Confidence-based data management for personal area sensor networks. In: The Workshop on Data Management for Sensor Networks (2004)","DOI":"10.1145\/1052199.1052204"},{"key":"182_CR127","unstructured":"Tavakoli, A., Zhang, J., Son, S.H.: Group-based event detection in undersea sensor networks. In: Second International Workshop on Networked Sensing Systems. San Diego, CA (2005)"},{"key":"182_CR128","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1023\/A:1008014206206","volume":"20","author":"P. Teissier","year":"1998","unstructured":"Teissier, P., Guerin-Dugue, A., Schwartz, J.L.: Models for audiovisual fusion in a noisy-vowel recognition task. J. VLSI Signal Process. 20, 25\u201344 (1998)","journal-title":"J. VLSI Signal Process."},{"key":"182_CR129","unstructured":"Teriyan, V.Y., Puuronen, S.: Multilevel context representation using semantic metanetwork. In: International and Interdisciplinary Conference on Modeling and Using Context, pp. 21\u201332. Rio de Janeiro, Brazil (1997)"},{"key":"182_CR130","doi-asserted-by":"crossref","unstructured":"Tesic, J., Natsev, A., Lexing, X., Smith, J.R.: Data modeling strategies for imbalanced learning in visual search. In: IEEE International Conference on Multimedia and Expo, pp. 1990\u20131993. Beijing (2007)","DOI":"10.1109\/ICME.2007.4285069"},{"key":"182_CR131","doi-asserted-by":"crossref","first-page":"235","DOI":"10.1007\/s11263-006-7834-8","volume":"71","author":"C. Town","year":"2007","unstructured":"Town, C.: Multi-sensory and multi-modal fusion for sentient computing. Int. J. Comput. Vis. 71, 235\u2013253 (2007)","journal-title":"Int. J. Comput. Vis."},{"key":"182_CR132","doi-asserted-by":"crossref","unstructured":"Vermaak, J., Gangnet, M., Blake, A., Perez, P.: Sequential monte carlo fusion of sound and vision for speaker tracking. In: The 8th IEEE International Conference on Computer Vision, vol. 1, pp. 741\u2013746. Paris, France (2001)","DOI":"10.1109\/ICCV.2001.937600"},{"key":"182_CR133","doi-asserted-by":"crossref","unstructured":"Voorhees, E.M., Gupta, N.K., Johnson-Laird, B.: Learning collection fusion strategies. In: ACM International Conference on Research and Development in Information Retrieval, pp. 172\u2013179. Seattle, WA (1995)","DOI":"10.1145\/215206.215357"},{"key":"182_CR134","first-page":"91","volume-title":"Singular Value Decomposition and Principal Component Analysis, Chap. 5","author":"M.E. Wall","year":"2003","unstructured":"Wall, M.E., Rechtsteiner, A., Rocha, L.M.: Singular Value Decomposition and Principal Component Analysis, Chap. 5, pp. 91\u2013109. Kluwel, Norwell, MA (2003)"},{"key":"182_CR135","doi-asserted-by":"crossref","unstructured":"Wang, J., Kankanhalli, M.S.: Experience-based sampling technique for multimedia analysis. In: ACM International Conference on Multimedia, pp. 319\u2013322. Berkeley, CA (2003)","DOI":"10.1145\/957013.957081"},{"key":"182_CR136","doi-asserted-by":"crossref","unstructured":"Wang, J., Kankanhalli, M.S., Yan, W.Q., Jain, R.: Experiential sampling for video surveillance. In: ACM Workshop on Video Surveillance. Berkeley (2003)","DOI":"10.1145\/982452.982462"},{"issue":"3","key":"182_CR137","doi-asserted-by":"crossref","first-page":"14","DOI":"10.1145\/1236471.1236473","volume":"3","author":"S. Wang","year":"2007","unstructured":"Wang, S., Dash, M., Chia, L.T., Xu, M.: Efficient sampling of training set in large and noisy multimedia data. ACM Trans. Multimed. Comput. Commun. Appl. 3(3), 14 (2007)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"182_CR138","doi-asserted-by":"crossref","unstructured":"Wang, Y., Liu, Z., Huang, J.C.: Multimedia content analysis: using both audio and visual clues. In: IEEE Signal Processing Magazine, pp. 12\u201336 (2000)","DOI":"10.1109\/79.888862"},{"key":"182_CR139","unstructured":"Westerveld, T.: Image retrieval: content versus context. In: RIAO Content-Based Multimedia Information Access. Paris, France (2000)"},{"key":"182_CR140","unstructured":"Wu, H.: Sensor data fusion for context-aware computing using dempster\u2013shafer theory. Ph.D. thesis, The Robotics Institute, Carnegie Mellon University, Pittsburgh, PA (2003)"},{"key":"182_CR141","unstructured":"Wu, K., Lin, C.K., Chang, E., Smith, J.R.: Multimodal information fusion for video concept detection. In: IEEE International Conference on Image Processing, pp. 2391\u20132394. Singapore (2004)"},{"key":"182_CR142","doi-asserted-by":"crossref","unstructured":"Wu, Y., Chang, E., Tsengh, B.L.: Multimodal metadata fusion using causal strength. In: ACM International Conference on Multimedia, pp. 872\u2013881. Singapore (2005)","DOI":"10.1145\/1101149.1101338"},{"key":"182_CR143","doi-asserted-by":"crossref","unstructured":"Wu, Y., Chang, E.Y., Chang, K.C.C., Smith, J.R.: Optimal multimodal fusion for multimedia data analysis. In: ACM International Conference on Multimedia, pp. 572\u2013579. New York City, NY (2004)","DOI":"10.1145\/1027527.1027665"},{"key":"182_CR144","doi-asserted-by":"crossref","unstructured":"Wu, Z., Cai, L., Meng, H.: Multi-level fusion of audio and visual features for speaker identification. In: International Conference on Advances in Biometrics, pp. 493\u2013499 (2006)","DOI":"10.1007\/11608288_66"},{"key":"182_CR145","doi-asserted-by":"crossref","unstructured":"Xie, L., Kennedy, L., Chang, S.F., Divakaran, A., Sun, H., Lin, C.Y.: Layered dynamic mixture model for pattern discovery in asynchronous multi-modal streams. In: IEEE International Conference on Acoustics, Speech, and Signal Processing, vol. 2, pp. 1053\u20131056. Philadelphia, USA (2005)","DOI":"10.1109\/ICASSP.2005.1415589"},{"key":"182_CR146","doi-asserted-by":"crossref","first-page":"163","DOI":"10.1016\/S1566-2535(02)00055-6","volume":"3","author":"N. Xiong","year":"2002","unstructured":"Xiong, N., Svensson, P.: Multi-sensor management for information fusion: issues and approaches. Inf. Fusion 3, 163\u2013186(24) (2002)","journal-title":"Inf. Fusion"},{"issue":"3","key":"182_CR147","doi-asserted-by":"crossref","first-page":"421","DOI":"10.1109\/TMM.2008.917346","volume":"10","author":"C. Xu","year":"2008","unstructured":"Xu, C., Wang, J., Lu, H., Zhang, Y.: A novel framework for semantic annotation and personalized retrieval of sports video. IEEE Trans. Multimed. 10(3), 421\u2013436 (2008)","journal-title":"IEEE Trans. Multimed."},{"issue":"7","key":"182_CR148","doi-asserted-by":"crossref","first-page":"1342","DOI":"10.1109\/TMM.2008.2004912","volume":"10","author":"C. Xu","year":"2008","unstructured":"Xu, C., Zhang, Y.F., Zhu, G., Rui, Y., Lu, H., Huang, Q.: Using webcast text for semantic event detection in broadcast sports video. IEEE Trans. Multimed. 10(7), 1342\u20131355 (2008)","journal-title":"IEEE Trans. Multimed."},{"issue":"1","key":"182_CR149","doi-asserted-by":"crossref","first-page":"44","DOI":"10.1145\/1126004.1126007","volume":"2","author":"H. Xu","year":"2006","unstructured":"Xu, H., Chua, T.S.: Fusion of AV features and external information sources for event detection in team sports video. ACM Trans. Multimed. Comput. Commun. Appl. 2(1), 44\u201367 (2006)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"182_CR150","unstructured":"Yan, R.: Probabilistic models for combining diverse knowledge sources in multimedia retrieval. Ph.D. thesis. Carnegie Mellon University (2006)"},{"key":"182_CR151","doi-asserted-by":"crossref","unstructured":"Yan, R., Yang, J., Hauptmann, A.: Learning query-class dependent weights in automatic video retrieval. In: ACM International Conference on Multimedia, pp. 548\u2013555. New York, USA (2004)","DOI":"10.1145\/1027527.1027661"},{"key":"182_CR152","doi-asserted-by":"crossref","first-page":"131","DOI":"10.1002\/ima.20046","volume":"15","author":"M.T. Yang","year":"2005","unstructured":"Yang, M.T., Wang, S.C., Lin, Y.Y.: A multimodal fusion system for people detection and tracking. International Journal of Imaging Systems and Technology 15, 131\u2013142 (2005)","journal-title":"International Journal of Imaging Systems and Technology"},{"issue":"4","key":"182_CR153","doi-asserted-by":"crossref","first-page":"399","DOI":"10.1145\/954339.954342","volume":"35","author":"W. Zhao","year":"2003","unstructured":"Zhao, W., Chellappa, R., Phillips, P.J., Rosenfeld, A. Face recognition: a literature survey. ACM Comput. Surv. 35(4), 399\u2013458 (2003)","journal-title":"ACM Comput. Surv."},{"issue":"11","key":"182_CR154","doi-asserted-by":"crossref","first-page":"1244","DOI":"10.1016\/j.imavis.2005.06.008","volume":"24","author":"Q. Zhou","year":"2006","unstructured":"Zhou, Q., Aggarwal, J.: Object tracking in an outdoor environment using fusion of features and cameras. Image Vis. Comput. 24(11), 1244\u20131255 (2006)","journal-title":"Image Vis. Comput."},{"key":"182_CR155","doi-asserted-by":"crossref","unstructured":"Zhou, Z.H.: Learning with unlabeled data and its application to image retrieval. In: The 9th Pacific Rim International Conference on Artificial Intelligence, pp. 5\u201310. Guilin (2006)","DOI":"10.1007\/978-3-540-36668-3_3"},{"key":"182_CR156","doi-asserted-by":"crossref","unstructured":"Zhu, Q., Yeh, M.C., Cheng, K.T.: Multimodal fusion using learned text concepts for image categorization. In: ACM International Conference on Multimedia, pp. 211\u2013220. Santa Barbara (2006)","DOI":"10.1145\/1180639.1180698"},{"key":"182_CR157","doi-asserted-by":"crossref","unstructured":"Zotkin, D.N., Duraiswami, R., Davis, L.S.: Joint audio-visual tracking using particle filters. EURASIP J. Appl. Signal Process. (11), 1154\u20131164 (2002)","DOI":"10.1155\/S1110865702206058"},{"key":"182_CR158","unstructured":"Zou, X., Bhanu, B.: Tracking humans using multimodal fusion. In: IEEE Conference on Computer Vision and Pattern Recognition, p. 4. Washington (2005)"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-010-0182-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00530-010-0182-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-010-0182-0","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,19]],"date-time":"2025-02-19T20:54:39Z","timestamp":1739998479000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00530-010-0182-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2010,4,4]]},"references-count":158,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2010,11]]}},"alternative-id":["182"],"URL":"https:\/\/doi.org\/10.1007\/s00530-010-0182-0","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2010,4,4]]}}}