{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,7]],"date-time":"2025-04-07T23:40:09Z","timestamp":1744069209311,"version":"3.40.3"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2012,9,7]],"date-time":"2012-09-07T00:00:00Z","timestamp":1346976000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2012,12]]},"DOI":"10.1007\/s13735-012-0020-6","type":"journal-article","created":{"date-parts":[[2012,9,6]],"date-time":"2012-09-06T06:54:16Z","timestamp":1346914456000},"page":"223-238","source":"Crossref","is-referenced-by-count":6,"title":["Video concept detection by audio-visual grouplets"],"prefix":"10.1007","volume":"1","author":[{"given":"Wei","family":"Jiang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexander C.","family":"Loui","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2012,9,7]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","first-page":"147","DOI":"10.1016\/0923-5965(95)00003-F","volume":"7","author":"T Aach","year":"1995","unstructured":"Aach T, Kaup A (1995) Bayesian algorithms for adaptive change detection in image sequences using Markov random fields. Signal Process: Image Commun 7:147\u2013160","journal-title":"Signal Process: Image Commun"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Akutsu A, Tonomura Y (1994) Video tomography: an efficient method for camerawork extraction and motion analysis. In: ACM multimedia, pp 349\u2013356","DOI":"10.1145\/192593.192697"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Barzelay Z, Schechner Y (2007) Harmony in motion. In: IEEE CVPR, pp 1\u20138","DOI":"10.1109\/CVPR.2007.383344"},{"issue":"7","key":"20_CR4","doi-asserted-by":"crossref","first-page":"828","DOI":"10.1109\/TPAMI.2003.1206512","volume":"25","author":"MJ Beal","year":"2003","unstructured":"Beal MJ, Jojic N, Attias H (2003) A graphical model for audiovisual object tracking. IEEE PAMI 25(7):828\u2013836","journal-title":"IEEE PAMI"},{"key":"20_CR5","doi-asserted-by":"crossref","unstructured":"Chang S et al (2007) Large-scale multimodal semantic concept detection for consumer video. In: ACM MIR, pp 255\u2013264","DOI":"10.1145\/1290082.1290118"},{"key":"20_CR6","doi-asserted-by":"crossref","unstructured":"Cotton C, Ellis D, Loui A (2011) Soundtrack classification by transient events. In: IEEE ICASSP, Czech Republic","DOI":"10.1109\/ICASSP.2011.5946443"},{"issue":"2","key":"20_CR7","doi-asserted-by":"crossref","first-page":"257","DOI":"10.1109\/TMM.2006.886263","volume":"9","author":"M Cristani","year":"2007","unstructured":"Cristani M, Manuele B, Vittorio M (2007) Audio-visual event recognition in surveillance video sequences. IEEE Trans Multimedia 9(2):257\u2013267","journal-title":"IEEE Trans Multimedia"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Dalal N, Triggs B (2005) Histograms of oriented gradients for human detection. In: IEEE CVPR, pp 886\u2013893","DOI":"10.1109\/CVPR.2005.177"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Davis J et al (2007) Information-theoretic metric learning. In: ICML, pp 209\u2013216","DOI":"10.1145\/1273496.1273523"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Ding M, Chen Y, Bressler SL (2006) Granger causality: basic theory and applications to neuroscience. In: Schelter S et al (eds) Handbook of time series analysis. Wiley, Wienheim","DOI":"10.1002\/9783527609970.ch17"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Divvala S et al (2009) An empirical study of context in object detection. In: IEEE CVPR, Miami","DOI":"10.1109\/CVPRW.2009.5206532"},{"issue":"1","key":"20_CR12","doi-asserted-by":"crossref","first-page":"32","DOI":"10.2174\/1874479610801010032","volume":"1","author":"SY Elhabian","year":"2008","unstructured":"Elhabian SY, El-Sayed KM (2008) Moving object detection in spatial domain using background removal techniques: state-of-art. Recent Patents Comput Sci 1(1):32\u201354","journal-title":"Recent Patents Comput Sci"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Enqvist O, Josephson K, Kahl F (2009) Optimal correspondences from pairwise constraints. In: IEEE ICCV, Kyoto","DOI":"10.1109\/ICCV.2009.5459319"},{"key":"20_CR14","unstructured":"Globerson A, Roweis S (2006) Metric learning by collapsing classes. In: NIPS, pp 451\u2013458"},{"issue":"3","key":"20_CR15","doi-asserted-by":"crossref","first-page":"424","DOI":"10.2307\/1912791","volume":"37","author":"C Granger","year":"1969","unstructured":"Granger C (1969) Investigating causal relations by econometric models and cross-spectral methods. Econometrica 37(3):424\u2013438","journal-title":"Econometrica"},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Iwano K et al (2007) Audio-visual speech recognition using lip information extracted from side-face images. EURASIP J ASMP 2007(1):4\u201312","DOI":"10.1155\/2007\/64506"},{"key":"20_CR17","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1823746.1823748","volume":"6","author":"W Jiang","year":"2010","unstructured":"Jiang W et al (2010) Audio-visual atoms for generic video concept classification. ACM TOMCCAP 6:1\u201319","journal-title":"ACM TOMCCAP"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Jiang Y, Ngo CW, Yang J (2007) Towards optimal bag-of-features for object categorization and semantic video retrieval. In: ACM CIVR, pp 494\u2013501","DOI":"10.1145\/1282280.1282352"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Jiang Y et al (2011) Consumer video understanding: a benchmark database and an evaluation of human and machine performance. ACM ICMR, Trento","DOI":"10.1145\/1991996.1992025"},{"issue":"4","key":"20_CR20","doi-asserted-by":"crossref","first-page":"295","DOI":"10.1016\/0923-5965(95)00054-2","volume":"8","author":"P Joly","year":"1996","unstructured":"Joly P, Kim HK (1996) Efficient automatic analysis of camera work and microsegmentation of video using spatiotemporal images. Signal Process: Image Commun 8(4):295\u2013307","journal-title":"Signal Process: Image Commun"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Ke Y, Sukthankar R, Hebert M (2007) Event detection in crowded videos. IEEE ICCV, Brazil","DOI":"10.1109\/ICCV.2007.4409011"},{"key":"20_CR22","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR.2008.4587756","volume-title":"Learning realistic human actions from movies","author":"I Laptev","year":"2008","unstructured":"Laptev I et al (2008) Learning realistic human actions from movies. IEEE CVPR, Alaska"},{"key":"20_CR23","unstructured":"Lin WH, Hauptmann A (2002) News video classification using svm-based multimodal classifiers and combination strategies. In: Proc ACM multimedia, pp 323\u2013326"},{"issue":"2","key":"20_CR24","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe DG (2004) Distinctive image features from scale-invariant keypoints. IJCV 60(2):91\u2013110","journal-title":"IJCV"},{"key":"20_CR25","doi-asserted-by":"crossref","unstructured":"Marszalek M, Laptev I, Schmid C (2009) Actions in context. In: IEEE CVPR, Miami","DOI":"10.1109\/CVPRW.2009.5206557"},{"key":"20_CR26","unstructured":"Mihalcea R, Corley C, Strapparava C (2006) Corpus-based and knowledge-based measures of text semantic similarity. In: National conference on artificial intelligence, pp 775\u2013780"},{"issue":"1","key":"20_CR27","doi-asserted-by":"crossref","first-page":"55","DOI":"10.1007\/s10827-008-0126-2","volume":"27","author":"A Nedungadi","year":"2009","unstructured":"Nedungadi A et al (2009) Analyzing multiple spike trains with nonparametric granger causality. J Comput Neurosci 27(1):55\u201364","journal-title":"J Comput Neurosci"},{"key":"20_CR28","unstructured":"Ng A, Jordan M, Weiss Y (2001) On spectral clustering: analysis and an algorithm. NIPS"},{"issue":"9","key":"20_CR29","doi-asserted-by":"crossref","first-page":"1837","DOI":"10.1109\/78.942614","volume":"49","author":"DT Pham","year":"2001","unstructured":"Pham DT, Cardoso JF (2001) Blind separation of instantaneous mixtures of non stationary sources. IEEE Trans Signal Process 49(9):1837\u20131848","journal-title":"IEEE Trans Signal Process"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Prabhakar K et al (2010) Temporal causality for the analysis of visual events. In: IEEE CVPR, San Francisco","DOI":"10.1109\/CVPR.2010.5539871"},{"key":"20_CR31","unstructured":"Roweis ST (2001) One microphone source separation. NIPS"},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Sargin M et al (2009) Audiovisual celebrity recognition in unconstrained web videos. In: IEEE ICASSP, Taipei","DOI":"10.1109\/ICASSP.2009.4959999"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Sch\u00f6lkopf B, Smola AJ (2002) Learning with kernels: support vector machines, regularization, optimization, and beyond. MIT, Cambridge","DOI":"10.7551\/mitpress\/4175.001.0001"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Smeaton AF, Over P, Kraaij W (2006) Evaluation campaigns and TRECVid. In: ACM MIR, pp 321\u2013330","DOI":"10.1145\/1178677.1178722"},{"issue":"8","key":"20_CR35","doi-asserted-by":"crossref","first-page":"747","DOI":"10.1109\/34.868677","volume":"22","author":"C Stauffer","year":"2000","unstructured":"Stauffer C, Grimson E (2000) Learning patterns of activity using realtime tracking. IEEE PAMI 22(8):747\u2013757","journal-title":"IEEE PAMI"},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Varma M, Babu BR (2009) More generality in efficient multiple kernel learning. In: ICML, pp 1065\u20131072","DOI":"10.1145\/1553374.1553510"},{"issue":"4","key":"20_CR37","doi-asserted-by":"crossref","first-page":"767","DOI":"10.1093\/biomet\/87.4.767","volume":"87","author":"A Walden","year":"2000","unstructured":"Walden A (2000) A unified view of multitaper multivariate spectral estimation. Biometrika 87(4):767\u2013788","journal-title":"Biometrika"},{"key":"20_CR38","unstructured":"Wang B, Plumbley MD (2006) Investigating single-channel audio source separation methods based on non-negative matrix factorization. In: ICArn, pp 17\u201320"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Wang W et al (2005) Video assisted speech source separation. In: IEEE ICASSP, pp 425\u2013428","DOI":"10.1109\/ICASSP.2005.1416331"},{"issue":"12","key":"20_CR40","first-page":"207","volume":"10","author":"K Weinberger","year":"2009","unstructured":"Weinberger K, Saul L (2009) Distance metric learning for large margin nearest neighbor classification. JMLR 10(12):207\u2013244","journal-title":"JMLR"},{"issue":"2","key":"20_CR41","first-page":"286","volume":"11","author":"L Wu","year":"2009","unstructured":"Wu L et al (2009) Scale-invariant visual language modeling for object categorization. IEEE TMM 11(2):286\u2013294","journal-title":"IEEE TMM"},{"issue":"7","key":"20_CR42","first-page":"1908","volume":"19","author":"L Wu","year":"2010","unstructured":"Wu L et al (2010) Semantics-preserving bag-of-words models and applications. IEEE TIP 19(7):1908\u20131920","journal-title":"IEEE TIP"},{"key":"20_CR43","unstructured":"Wu Y et al (2004) Multimodal information fusion for video concept detection. IEEE ICIP, pp 2391\u20132394"},{"key":"20_CR44","doi-asserted-by":"crossref","unstructured":"Yang J et al (2007) Evaluating bag-of-visual-words representations in scene classification. ACM MIR, pp 197\u2013206","DOI":"10.1145\/1290082.1290111"},{"key":"20_CR45","volume-title":"Grouplet: a structured image representation for recognizing human and object interactions","author":"B Yao","year":"2010","unstructured":"Yao B, Fei-Fei L (2010) Grouplet: a structured image representation for recognizing human and object interactions. IEEE CVPR, San Francisco"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-012-0020-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s13735-012-0020-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-012-0020-6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,7]],"date-time":"2025-04-07T23:05:01Z","timestamp":1744067101000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s13735-012-0020-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,9,7]]},"references-count":45,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2012,12]]}},"alternative-id":["20"],"URL":"https:\/\/doi.org\/10.1007\/s13735-012-0020-6","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"type":"print","value":"2192-6611"},{"type":"electronic","value":"2192-662X"}],"subject":[],"published":{"date-parts":[[2012,9,7]]}}}