{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:18:18Z","timestamp":1775326698161,"version":"3.50.1"},"reference-count":174,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2012,11,13]],"date-time":"2012-11-13T00:00:00Z","timestamp":1352764800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2013,6]]},"DOI":"10.1007\/s13735-012-0024-2","type":"journal-article","created":{"date-parts":[[2012,11,13]],"date-time":"2012-11-13T09:54:34Z","timestamp":1352800474000},"page":"73-101","source":"Crossref","is-referenced-by-count":118,"title":["High-level event recognition in unconstrained videos"],"prefix":"10.1007","volume":"2","author":[{"given":"Yu-Gang","family":"Jiang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Subhabrata","family":"Bhattacharya","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shih-Fu","family":"Chang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mubarak","family":"Shah","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2012,11,13]]},"reference":[{"issue":"3","key":"24_CR1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1922649.1922653","volume":"43","author":"JK Aggarwal","year":"2011","unstructured":"Aggarwal JK, Ryoo MS (2011) Human activity analysis: a review. ACM Comput Surv 43(3):1\u201316","journal-title":"ACM Comput Surv"},{"issue":"2","key":"24_CR2","doi-asserted-by":"crossref","first-page":"288","DOI":"10.1109\/TPAMI.2008.284","volume":"32","author":"S Ali","year":"2010","unstructured":"Ali S, Shah M (2010) Human action recognition in videos using kinematic features and multiple instance learning. IEEE Trans Pattern Anal Mach Intell 32(2):288\u2013303","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"11","key":"24_CR3","doi-asserted-by":"crossref","first-page":"832","DOI":"10.1145\/182.358434","volume":"26","author":"JF Allen","year":"1983","unstructured":"Allen JF (1983) Maintaining knowledge about temporal intervals. Commun ACM 26(11):832\u2013843","journal-title":"Commun ACM"},{"issue":"9","key":"24_CR4","doi-asserted-by":"crossref","first-page":"2318","DOI":"10.1523\/JNEUROSCI.05-09-02318.1985","volume":"5","author":"CG Atkeson","year":"1985","unstructured":"Atkeson CG, Hollerbach JM (1985) Kinematic features of unrestrained vertical arm movements. J Neurosci 5(9):2318\u20132330","journal-title":"J Neurosci"},{"issue":"2","key":"24_CR5","doi-asserted-by":"crossref","first-page":"881","DOI":"10.1121\/1.2750160","volume":"122","author":"JJ Aucouturier","year":"2007","unstructured":"Aucouturier JJ, Defreville B, Pachet F (2007) The bag-of-frames approach to audio pattern recognition: a sufficient model for urban soundscapes but not for polyphonic music. J Acoust Soc Am 122(2):881\u2013891","journal-title":"J Acoust Soc Am"},{"key":"24_CR6","unstructured":"Aytar Y, Shah M, Luo J (2008) Utilizing semantic word similarity measures for video retrieval. In: Proceedings of IEEE conference on computer vision and pattern recognition, Providence, USA"},{"key":"24_CR7","doi-asserted-by":"crossref","unstructured":"Baillie M, Jose JM (2003) Audio-based event detection for sports video. In: Proceedings of international conference on image and video retrieval, Urbana-Champaign, IL","DOI":"10.1007\/3-540-45113-7_30"},{"issue":"1","key":"24_CR8","doi-asserted-by":"crossref","first-page":"279","DOI":"10.1007\/s11042-010-0643-7","volume":"51","author":"L Ballan","year":"2011","unstructured":"Ballan L, Bertini M, Bimbo AD, Seidenari L, Serra G (2011) Event detection and recognition for semantic annotation of video. Multimedia Tools Appl 51(1):279\u2013302","journal-title":"Multimedia Tools Appl"},{"key":"24_CR9","doi-asserted-by":"crossref","unstructured":"Banko M, Mittal VO, Witbrock, MJ (2000) Headline generation based on statistical translation. In: Proceedings of the annual meeting of the association for computational linguistics, Hong Kong","DOI":"10.3115\/1075218.1075259"},{"key":"24_CR10","unstructured":"Bao L, Yu SI, Lan ZZ, Overwijk A, Jin Q, Langner B, Garbus M, Burger S, Metze F, Hauptmann A (2011) Informedia @ TRECVID 2011. In: Proceedings of NIST TRECVID, Workshop, Gaithersburg, MD, USA"},{"key":"24_CR11","unstructured":"Barbu, A., Bridge, A., Coroian, D., Dickinson, S., Mussman, S., Narayanaswamy, S., Salvi, D., Schmidt, L., Shangguan, J., Siskind, J.M., Waggoner, J., Wang, S., Wei, J., Yin, Y., Zhang, Z.: Large-scale automatic labeling of video events with verbs based on event-participant interaction. In: arXiv:1204.3616v1 (2012)"},{"issue":"3","key":"24_CR12","doi-asserted-by":"crossref","first-page":"346","DOI":"10.1016\/j.cviu.2007.09.014","volume":"110","author":"H Bay","year":"2008","unstructured":"Bay H, Ess A, Tuytelaars T, van Gool L (2008) SURF: speeded up robust features. Comput Vision Image Underst 110(3):346\u2013359","journal-title":"Comput Vision Image Underst"},{"issue":"7","key":"24_CR13","doi-asserted-by":"crossref","first-page":"828","DOI":"10.1109\/TPAMI.2003.1206512","volume":"25","author":"MJ Beal","year":"2003","unstructured":"Beal MJ, Jojic N, Attias H (2003) A graphical model for audiovisual object tracking. IEEE Trans Pattern Anal Mach Intell 25(7):828\u2013836","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"24_CR14","doi-asserted-by":"crossref","unstructured":"Blank M, Gorelick L, Shechtman E, Irani M, Basri R (2005) Actions as space-time shapes. In: Proceedings of International Conference on Computer Vision","DOI":"10.1109\/ICCV.2005.28"},{"key":"24_CR15","doi-asserted-by":"crossref","first-page":"1257","DOI":"10.1098\/rstb.1997.0108","volume":"352","author":"AF Bobick","year":"1997","unstructured":"Bobick AF (1997) Movement, activity, and action: the role of knowledge in the perception of motion. Philos Trans Royal Soc London 352:1257\u20131265","journal-title":"Philos Trans Royal Soc London"},{"key":"24_CR16","doi-asserted-by":"crossref","unstructured":"Boiman O, Shechtman E, Irani M (2008) In defense of nearest-neighbor based image classification. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2008.4587598"},{"issue":"3","key":"24_CR17","doi-asserted-by":"crossref","first-page":"416","DOI":"10.1109\/TSMCC.2008.919173","volume":"38","author":"D Brezeale","year":"2008","unstructured":"Brezeale D, Cook D (2008) Automatic video classification: a survey of the literature. IEEE Trans Syst Man Cybernet Part C 38(3):416\u2013430","journal-title":"IEEE Trans Syst Man Cybernet Part C"},{"issue":"3","key":"24_CR18","first-page":"663","volume":"12","author":"C Campos de","year":"2011","unstructured":"de Campos C, Ji Q (2011) Efficient structure learning of bayesian networks using constraints. J Mach Learn Res 12(3):663\u2013689","journal-title":"J Mach Learn Res"},{"key":"24_CR19","doi-asserted-by":"crossref","unstructured":"Cao J, Zhang YD, Song YC, Chen ZN, Zhang X, Li JT (2009) MCG-WEBV: a benchmark dataset for web video analysis. Tech. rep., ICT-MCG-09-001, Institute of Computing Technology, Chinese Academy of Sciences","DOI":"10.1145\/1646396.1646458"},{"key":"24_CR20","unstructured":"Castel C, Chaudron L, Tessier C (1996) What is going on? a high level interpretation of sequences of images. In: Proceedings of European conference on computer vision, Springer-Verlag, London, UK"},{"key":"24_CR21","unstructured":"Chang SF, He J, Jiang YG, El Khoury E, Ngo CW, Yanagawa A, Zavesky, E. (2008) Columbia University\/VIREO-CityU\/IRIT TRECVID2008 high-level feature extraction and interactive video search. In: Proceedings of NIST TRECVID, Workshop, Gaithersburg"},{"key":"24_CR22","doi-asserted-by":"crossref","unstructured":"Chang YL, Zeng W, Kamel I, Alonso R (1996) Integrated image and speech analysis for content-based video indexing. In: Proceedings of IEEE international conference on multimedia computing and systems, Washington, DC","DOI":"10.1109\/MMCS.1996.534992"},{"key":"24_CR23","unstructured":"Chen M, Xu ZE, Weinberger KQ, Sha F (2012) Marginalized stacked denoising autoencoders for domain adaptation. In: Proceedings international conference on machine learning"},{"key":"24_CR24","unstructured":"Cheng H et al (2011) Team SRI-Sarnoff\u2019s AURORA System @ TRECVID 2011. In: Proceedings of NIST TRECVID, Workshop"},{"key":"24_CR25","doi-asserted-by":"crossref","unstructured":"Connolly CI (2007) Learning to recognize complex actions using conditional random fields. In: Proceedings of International Conference on Advances in Visual Computing","DOI":"10.1007\/978-3-540-76856-2_33"},{"key":"24_CR26","doi-asserted-by":"crossref","unstructured":"Cotton CV, Ellis DPW, Loui AC (2011) Soundtrack classification by transient events. In: Proceedings of IEEE international conference acoustics, speech, signal processing, pp 473\u2013476","DOI":"10.1109\/ICASSP.2011.5946443"},{"key":"24_CR27","doi-asserted-by":"crossref","unstructured":"Dalal N, Triggs B (2005) Histogram of oriented gradients for human detection. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2005.177"},{"key":"24_CR28","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li LJ, Li K, Fei-Fei L (2009) Imagenet: a large-scale hierarchical image database. In: Proceedings of IEEE conference on computer vision and, pattern recognition","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"24_CR29","doi-asserted-by":"crossref","unstructured":"Dollar P, Rabaud V, Cottrell G, Belongie S (2005) Behavior recognition via sparse spatio-temporal features. In Proceedings of joint IEEE international workshop on visual surveillance and performance evaluation of tracking and surveillance","DOI":"10.1109\/VSPETS.2005.1570899"},{"key":"24_CR30","unstructured":"Dorko G (2012) Interest point detectors local descriptors. http:\/\/lear.inrialpes.fr\/people\/dorko\/downloads.html"},{"key":"24_CR31","doi-asserted-by":"crossref","unstructured":"Duan L, Xu D, Tsang IW, Luo J (2010) Visual event recognition in videos by learning from web data. In: Proceedings of IEEE conference on computer vision and, pattern recognition","DOI":"10.1109\/CVPR.2010.5539870"},{"key":"24_CR32","doi-asserted-by":"crossref","unstructured":"Duchenne O, Laptev I, Sivic J, Bach F, Ponce J (2009) Automatic annotation of human actions in video. In: Proceedings of IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2009.5459279"},{"issue":"1","key":"24_CR33","doi-asserted-by":"crossref","first-page":"321","DOI":"10.1109\/TSA.2005.854103","volume":"14","author":"A Eronen","year":"2006","unstructured":"Eronen A, Peltonen V, Tuomi J, Klapuri A, Fagerlund S, Sorsa T, Lorho G, Huopaniemi J (2006) Audio-based context recognition. IEEE Trans Audio Speech Lang Process 14(1):321\u2013329","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"24_CR34","unstructured":"Everingham M, van Gool L, Williams CKI, Winn J, Zisserman A (2007) The PASCAL visual object classes challenge 2007 (VOC2007) Results. http:\/\/pascallin.ecs.soton.ac.uk\/challenges\/VOC\/voc2007\/ results\/index.shtml"},{"issue":"9","key":"24_CR35","doi-asserted-by":"crossref","first-page":"1530","DOI":"10.1109\/TPAMI.2009.167","volume":"32","author":"P Felzenszwalb","year":"2010","unstructured":"Felzenszwalb P, Girshick R, McAllester D, Ramanan D (2010) Object detection with discriminatively trained part based models. IEEE Trans Pattern Anal Mach Intell 32(9):1530\u20131535","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"24_CR36","unstructured":"Feng Y, Lapata M (2010) How many words is a picture worth? automatic caption generation for news images. In: Proceedings of the annual meeting of the association for computational linguistics"},{"key":"24_CR37","unstructured":"Fillmore CJ (1968) The case for case. In: Bach E, Harms R (eds), Universals in Linguistic Theory, New York, pp 1\u201388"},{"key":"24_CR38","unstructured":"Fiscus J et al (2011) TRECVID multimedia event detection evaluation plan. http:\/\/www.nist.gov\/itl\/iad\/mig\/upload\/MED11-EvalPlan-V03-20110801a.pdf"},{"issue":"4","key":"24_CR39","doi-asserted-by":"crossref","first-page":"76","DOI":"10.1109\/MMUL.2005.87","volume":"12","author":"ARJ Francois","year":"2005","unstructured":"Francois ARJ, Nevatia R, Hobbs J, Bolles RC (2005) Verl: an ontology framework for representing and annotating video events. IEEE Multimedia Magazine 12(4):76\u201386","journal-title":"IEEE Multimedia Magazine"},{"issue":"9","key":"24_CR40","doi-asserted-by":"crossref","first-page":"1392","DOI":"10.1109\/TPAMI.2005.169","volume":"27","author":"BJ Frey","year":"2005","unstructured":"Frey BJ, Jojic N (2005) A comparison of algorithms for inference and learning in probabilistic graphical models. IEEE Trans Pattern Anal Mach Intell 27(9):1392\u20131416","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"7","key":"24_CR41","doi-asserted-by":"crossref","first-page":"1271","DOI":"10.1109\/TPAMI.2009.132","volume":"32","author":"JC Gemert van","year":"2010","unstructured":"van Gemert JC, Veenman CJ, Smeulders AWM, Geusebroek JM (2010) Visual word ambiguity. IEEE Trans Pattern Anal Mach Intell 32(7):1271\u20131283","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"24_CR42","doi-asserted-by":"crossref","unstructured":"Ghanem N, DeMenthon D, Doermann D, Davis L (2004) Representation and recognition of events in surveillance video using petri nets. In: Proceedings of IEEE conference on computer vision and pattern recognition workshop","DOI":"10.1109\/CVPR.2004.430"},{"issue":"3","key":"24_CR43","doi-asserted-by":"crossref","first-page":"424","DOI":"10.2307\/1912791","volume":"37","author":"C Granger","year":"1969","unstructured":"Granger C (1969) Investigating causal relations by econometric models and cross-spectral methods. Econometrica 37(3):424\u2013438","journal-title":"Econometrica"},{"key":"24_CR44","unstructured":"Hakeem A, Sheikh Y, Shah M (2004) Casee: a hierarchical event representation for the analysis of videos. In: Proceedings of AAAI conference"},{"key":"24_CR45","doi-asserted-by":"crossref","DOI":"10.7551\/mitpress\/4170.001.0001","volume-title":"Learning Kernel classifiers: theory and algorithms","author":"R Herbrich","year":"2001","unstructured":"Herbrich R (2001) Learning Kernel classifiers: theory and algorithms. The MIT Press, Cambridge"},{"key":"24_CR46","unstructured":"Hu Y, Cao L, Lv F, Yan S, Gong Y, Huang TS (2009) Action detection in complex scenes with spatial and temporal ambiguities. In: Proceedings of IEEE international conference on computer vision"},{"issue":"4","key":"24_CR47","doi-asserted-by":"crossref","first-page":"749","DOI":"10.1109\/TMM.2006.876289","volume":"8","author":"CL Huang","year":"2006","unstructured":"Huang CL, Shih HC, Chao CY (2006) Semantic analysis of soccer video using dynamic bayesian network. IEEE Trans Multimedia 8(4):749\u2013760","journal-title":"IEEE Trans Multimedia"},{"key":"24_CR48","unstructured":"Inoue N, Kamishima Y, Wada T, Shinoda K, Sato S (2011) TokyoTech+Canon at TRECVID 2011. In: Proceedings of NIST TRECVID Workshop"},{"issue":"3","key":"24_CR49","doi-asserted-by":"crossref","first-page":"414","DOI":"10.1006\/cviu.2000.0896","volume":"81","author":"SS Intille","year":"2001","unstructured":"Intille SS, Bobick AF (2001) Recognizing planned, multiperson action. Comput Vision Image Underst 81(3):414\u2013445","journal-title":"Comput Vision Image Underst"},{"issue":"8","key":"24_CR50","doi-asserted-by":"crossref","first-page":"852","DOI":"10.1109\/34.868686","volume":"22","author":"YA Ivanov","year":"2000","unstructured":"Ivanov YA, Bobick AF (2000) Recognition of visual activities and interactions by stochastic parsing. IEEE Trans Pattern Anal Mach Intell 22(8):852\u2013872","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"24_CR51","doi-asserted-by":"crossref","unstructured":"Jiang W, Cotton C, Chang SF, Ellis D, Loui AC (2009) Short-term audio-visual atoms for generic video concept classification. In: Proceedings of ACM international conference on multimedia","DOI":"10.1145\/1631272.1631277"},{"key":"24_CR52","doi-asserted-by":"crossref","unstructured":"Jiang W, Loui AC (2011) Audio-visual grouplet: Temporal audio-visual interactions for general video concept classification. In: Proceedings of ACM international conference on multimedia","DOI":"10.1145\/2072298.2072316"},{"key":"24_CR53","doi-asserted-by":"crossref","unstructured":"Jiang YG (2012) SUPER: Towards real-time event recognition in Internet videos. In: Proceedings of ACM international conference on multimedia retrieval","DOI":"10.1145\/2324796.2324805"},{"key":"24_CR54","doi-asserted-by":"crossref","unstructured":"Jiang YG, Dai Q, Xue X, Liu W, Ngo CW (2012) Trajectory-based modeling of human actions with motion reference points. In: Proceedings of European conference on computer vision","DOI":"10.1007\/978-3-642-33715-4_31"},{"key":"24_CR55","doi-asserted-by":"crossref","unstructured":"Jiang YG, Ngo CW, Yang J (2007) Towards optimal bag-of-features for object categorization and semantic video retrieval. In: Proceedings of ACM international conference on image and video retrieval","DOI":"10.1145\/1282280.1282352"},{"issue":"1","key":"24_CR56","doi-asserted-by":"crossref","first-page":"42","DOI":"10.1109\/TMM.2009.2036235","volume":"12","author":"YG Jiang","year":"2010","unstructured":"Jiang YG, Yang J, Ngo CW, Hauptmann AG (2010) Representations of keypoint-based semantic concept detection: a comprehensive study. IEEE Trans Multimedia 12(1):42\u201353","journal-title":"IEEE Trans Multimedia"},{"key":"24_CR57","doi-asserted-by":"crossref","unstructured":"Jiang YG, Ye G, Chang SF, Ellis D, Loui AC (2011) Consumer video understanding: a bechmark database and an evaluation of human and machine performance. In: Proceedings of ACM international conference on multimedia retrieval","DOI":"10.1145\/1991996.1992025"},{"key":"24_CR58","unstructured":"Jiang YG, Zeng X, Ye G, Bhattacharya S, Ellis D, Shah M, Chang SF (2010) Columbia-UCF TRECVID2010 multimedia event detection: Combining multiple modalities, contextual concepts, and temporal matching. In: Proceedings of NIST TRECVID, Workshop"},{"key":"24_CR59","unstructured":"Joo SW, Chellappa R (2006) Attribute grammar-based event recognition and anomaly detection. In: Proceedings of IEEE conference on computer vision and pattern recognition, Workshop"},{"key":"24_CR60","unstructured":"Ke Y, Sukthankar R (2004) PCA-SIFT: a more distinctive representation for local image descriptors. In: Proceedings of IEEE conference on computer vision and pattern recognition"},{"key":"24_CR61","doi-asserted-by":"crossref","unstructured":"Klaser A, Marszalek M, Schmid C (2008) A spatio-temporal descriptor based on 3d-gradients. In: Proceedings of British machine vision conference","DOI":"10.5244\/C.22.99"},{"key":"24_CR62","doi-asserted-by":"crossref","unstructured":"Knopp J, Prasad M, Willems G, Timofte R, van Gool L (2010) Hough transform and 3D SURF for robust three dimensional classification. In: Proceedings of European conference on computer vision","DOI":"10.1007\/978-3-642-15567-3_43"},{"issue":"2","key":"24_CR63","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima A, Tamura T, Fukunaga K (2002) Natural language description of human activities from video images based on concept hierarchy of actions. Int J Comput Vision 50(2):171\u2013184","journal-title":"Int J Comput Vision"},{"key":"24_CR64","doi-asserted-by":"crossref","unstructured":"Kuehne H, Jhuang H, Garrote E, Poggio T, Serre T (2011) HMDB: a large video database for human motion recognition. In: Proceedings of IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"24_CR65","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1007\/s11263-005-1838-7","volume":"64","author":"I Laptev","year":"2005","unstructured":"Laptev I (2005) On space-time interest points. Int J Comput Vision 64:107\u2013123","journal-title":"Int J Comput Vision"},{"key":"24_CR66","doi-asserted-by":"crossref","unstructured":"Laptev I, Marszalek M, Schmid C, Rozenfeld B (2008) Learning realistic human actions from movies. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2008.4587756"},{"issue":"5","key":"24_CR67","doi-asserted-by":"crossref","first-page":"489","DOI":"10.1109\/TSMCC.2009.2023380","volume":"39","author":"G Lavee","year":"2009","unstructured":"Lavee G, Rivlin E, Rudzsky M (2009) Understanding video events: a survey of methods for automatic interpretation of semantic occurrences in videos. IEEE Trans Syst Man Cybernet Part C 39(5):489\u2013504","journal-title":"IEEE Trans Syst Man Cybernet Part C"},{"key":"24_CR68","doi-asserted-by":"crossref","unstructured":"Lazebnik S, Schmid C, Ponce J (2006) Beyond bags of features: spatial pyramid matching for recognizing natural scene categories. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2006.68"},{"key":"24_CR69","doi-asserted-by":"crossref","unstructured":"Le QV, Zou WY, Yeung SY, Ng AY (2011) Learning hierarchical invariant spatio-temporal features for action recognition with independent subspace analysis. In: Proceedings of IEEE conference on computer vision and, pattern recognition","DOI":"10.1109\/CVPR.2011.5995496"},{"issue":"6","key":"24_CR70","doi-asserted-by":"crossref","first-page":"1406","DOI":"10.1109\/TASL.2009.2034776","volume":"18","author":"K Lee","year":"2010","unstructured":"Lee K, Ellis DPW (2010) Audio-based semantic concept classification for consumer video. IEEE Trans Audio Speech Lang Process 18(6):1406\u20131416","journal-title":"IEEE Trans Audio Speech Lang Process"},{"issue":"11","key":"24_CR71","doi-asserted-by":"crossref","first-page":"1499","DOI":"10.1109\/TCSVT.2008.2005597","volume":"18","author":"W Li","year":"2008","unstructured":"Li W, Zhang Z, Liu Z (2008) Expandable data-driven graphical modeling of human actions based on salient postures. IEEE Trans Circ Syst Video Technol 18(11):1499\u20131510","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"24_CR72","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1023\/A:1008045108935","volume":"30","author":"T Lindeberg","year":"1998","unstructured":"Lindeberg T (1998) Feature detection with automatic scale selection. Int J Comput Vision 30:79\u2013116","journal-title":"Int J Comput Vision"},{"key":"24_CR73","doi-asserted-by":"crossref","unstructured":"Liu J, Kuipers B, Savarese S (2011) Recognizing human actions by attributes. In: Proceedings of IEEE conference on computer vision and, pattern recognition, pp 3337\u20133344","DOI":"10.1109\/CVPR.2011.5995353"},{"key":"24_CR74","doi-asserted-by":"crossref","unstructured":"Liu J, Luo J, Shah M (2009) Recognizing realistic actions from videos \u201cin the wild\u201d. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2009.5206744"},{"key":"24_CR75","unstructured":"Liu J, Shah M (2008) Learning human actions via information maximization. In: Proceedings of IEEE conference on computer vision and pattern recognition"},{"key":"24_CR76","doi-asserted-by":"crossref","unstructured":"Loui AC, Luo J, Chang SF, Ellis D, Jiang W, Kennedy L, Lee K, Yanagawa A (2007) Kodak\u2019s consumer video benchmark data set: concept definition and annotation. In: Proceedings of ACM international workshop on multimedia, information retrieval","DOI":"10.1145\/1290082.1290117"},{"key":"24_CR77","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"D Lowe","year":"2004","unstructured":"Lowe D (2004) Distinctive image features from scale-invariant keypoints. Int J Comput Vision 60:91\u2013110","journal-title":"Int J Comput Vision"},{"issue":"1","key":"24_CR78","doi-asserted-by":"crossref","first-page":"74","DOI":"10.1109\/TMM.2007.911304","volume":"10","author":"L Lu","year":"2008","unstructured":"Lu L, Hanjalic A (2008) Audio keywords discovery for text-like audio content analysis and retrieval. IEEE Trans Multimedia 10(1):74\u201385","journal-title":"IEEE Trans Multimedia"},{"key":"24_CR79","unstructured":"Lucas BD, Kanade T (1981) An iterative image registration technique with an application to stereo vision. In: Proceedings of international joint conference on artificial intelligence"},{"issue":"9","key":"24_CR80","doi-asserted-by":"crossref","first-page":"2390","DOI":"10.1162\/NECO_a_00011","volume":"22","author":"RF Lyon","year":"2010","unstructured":"Lyon RF, Rehn M, Bengio S, Walters TC, Chechik G (2010) Sound retrieval and ranking using sparse auditory representations. Neural Comput 22(9):2390\u20132416","journal-title":"Neural Comput"},{"key":"24_CR81","doi-asserted-by":"crossref","unstructured":"Maji S, Berg AC, Malik J (2008) Classification using intersection kernel support vector machines is efficient. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2008.4587630"},{"key":"24_CR82","unstructured":"Mandel MI, Ellis DPW (2005) Song-level features and support vector machines for music classification. In: Proceedings of international society of music information retrieval conference"},{"issue":"8","key":"24_CR83","doi-asserted-by":"crossref","first-page":"837","DOI":"10.1109\/34.531803","volume":"18","author":"BS Manjunath","year":"1996","unstructured":"Manjunath BS, Ma WY (1996) Texture features for browsing and retrieval of image data. IEEE Trans Pattern Anal Mach Intell 18(8):837\u2013842","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"24_CR84","doi-asserted-by":"crossref","unstructured":"Martin A, Doddington G, Kamm T, Ordowski M, Przybocki M (1997) The det curve in assessment of detection task performance. In: Procedings of European conference on speech communication and technology, pp 1895\u20131898","DOI":"10.21437\/Eurospeech.1997-504"},{"key":"24_CR85","doi-asserted-by":"crossref","unstructured":"Matas J, Chum O, Urban M, Pajdla T (2002) Robust wide baseline stereo from maximally stable extremal regions. In: Proceedings of British machine vision conference, vol 1, pp 384\u2013393","DOI":"10.5244\/C.16.36"},{"key":"24_CR86","unstructured":"MediaEval: Multimedia retrieval benchmark evaluation. http:\/\/www.multimediaeval.org"},{"key":"24_CR87","doi-asserted-by":"crossref","unstructured":"Messing R, Pal C, Kautz H (2009) Activity recognition using the velocity histories of tracked keypoints. In: Proceedings of IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2009.5459154"},{"key":"24_CR88","doi-asserted-by":"crossref","first-page":"63","DOI":"10.1023\/B:VISI.0000027790.02288.f2","volume":"60","author":"K Mikolajczyk","year":"2004","unstructured":"Mikolajczyk K, Schmid C (2004) Scale and affine invariant interest point detectors. Int J Comput Vision 60:63\u201386","journal-title":"Int J Comput Vision"},{"issue":"10","key":"24_CR89","doi-asserted-by":"crossref","first-page":"1615","DOI":"10.1109\/TPAMI.2005.188","volume":"27","author":"K Mikolajczyk","year":"2005","unstructured":"Mikolajczyk K, Schmid C (2005) A performance evaluation of local descriptors. IEEE Trans Pattern Anal Mach Intell 27(10):1615\u20131630","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"1\/2","key":"24_CR90","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1007\/s11263-005-3848-x","volume":"65","author":"K Mikolajczyk","year":"2005","unstructured":"Mikolajczyk K, Tuytelaars T, Schmid C, Zisserman A, Matas J et al (2005) A comparison of affine region detectors. Int J Comput Vision 65(1\/2):43\u201372","journal-title":"Int J Comput Vision"},{"key":"24_CR91","doi-asserted-by":"crossref","first-page":"17","DOI":"10.1109\/93.713301","volume":"5","author":"K Minami","year":"1998","unstructured":"Minami K, Akutsu A, Hamada H, Tonomura Y (1998) Video handling with music and speech detection. IEEE Multimedia Magazine 5:17\u201325","journal-title":"IEEE Multimedia Magazine"},{"key":"24_CR92","unstructured":"Moore D, Essa I (2001) Recognizing multitasked activities using stochastic context-free grammar. In: Proceedings of AAAI conference"},{"issue":"9","key":"24_CR93","doi-asserted-by":"crossref","first-page":"1632","DOI":"10.1109\/TPAMI.2007.70822","volume":"30","author":"F Moosmann","year":"2008","unstructured":"Moosmann F, Nowak E, Jurie F (2008) Randomized clustering forests for image classification. IEEE Trans Pattern Anal Mach Intell 30(9):1632\u20131646","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"24_CR94","unstructured":"Morsillo N, Mann G, Pal C (2010) Youtube scale, large vocabulary video annotation, Chapter 14 in video search and mining. Springer-Verlag series on studies in computational intelligence. Springer, Berlin, pp 357\u2013386"},{"issue":"3","key":"24_CR95","doi-asserted-by":"crossref","first-page":"86","DOI":"10.1109\/MMUL.2006.63","volume":"13","author":"M Naphade","year":"2006","unstructured":"Naphade M, Smith J, Tesic J, Chang SF, Hsu W, Kennedy L, Hauptmann A, Curtis J (2006) Large-scale concept ontology for multimedia. IEEE Multimedia Magazine 13(3):86\u201391","journal-title":"IEEE Multimedia Magazine"},{"key":"24_CR96","unstructured":"Natarajan P et al (2011) BBN VISER TRECVID 2011 multimedia event detection system. In: Proceedings of NIST TRECVID, Workshop"},{"key":"24_CR97","doi-asserted-by":"crossref","unstructured":"Natarajan P, Nevatia R (2008) Online, real-time tracking and recognition of human actions. In: Proceedings of IEEE workshop on motion and video, computing, pp 1\u20138","DOI":"10.1109\/WMVC.2008.4544064"},{"key":"24_CR98","unstructured":"Natsev A, Smith JR, Hill M, Hua G, Huang B, Merler M, Xie L, Ouyang H, Zhou, M (2010) IBM Research TRECVID-2010 video copy detection and multimedia event detection system. In: Proceedings of NIST TRECVID, Workshop"},{"key":"24_CR99","unstructured":"NIST Trecvid Multimedia Event Detection (MED) task. http:\/\/www.nist.gov\/itl\/iad\/mig\/med.cfm"},{"key":"24_CR100","doi-asserted-by":"crossref","unstructured":"Nister D, Stewenius H (2006) Scalable recognition with a vocabulary tree. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2006.264"},{"key":"24_CR101","doi-asserted-by":"crossref","unstructured":"Nowak E, Jurie F, Triggs B (2006) Sampling strategies for bag-of-features image classification. In: Proceedings of European conference on computer vision","DOI":"10.1007\/11744085_38"},{"issue":"4","key":"24_CR102","doi-asserted-by":"crossref","first-page":"1126","DOI":"10.1109\/TIP.2010.2076821","volume":"20","author":"A Oikonomopoulos","year":"2011","unstructured":"Oikonomopoulos A, Patras I, Pantic M (2011) Spatiotemporal localization and categorization of human actions in unsegmented image sequences. IEEE Trans Image Process 20(4):1126\u20131140","journal-title":"IEEE Trans Image Process"},{"issue":"7","key":"24_CR103","doi-asserted-by":"crossref","first-page":"971","DOI":"10.1109\/TPAMI.2002.1017623","volume":"24","author":"T Ojala","year":"2002","unstructured":"Ojala T, Pietikainen M, Maenpaa T (2002) Multiresolution gray-scale and rotation invariant texture classification with local binary patterns. IEEE Trans Pattern Anal Mach Intell 24(7):971\u2013987","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"24_CR104","doi-asserted-by":"crossref","first-page":"145","DOI":"10.1023\/A:1011139631724","volume":"42","author":"A Oliva","year":"2001","unstructured":"Oliva A, Torralba A (2001) Modeling the shape of the scene: a holistic representation of the spatial envelope. Int J Comput Vision 42:145\u2013175","journal-title":"Int J Comput Vision"},{"key":"24_CR105","unstructured":"Ordonez V, Kulkarni G, Berg TL (2011) Im2Text: describing images using 1 million captioned photographs. In: Proceedings of advances in neural information processing systems"},{"key":"24_CR106","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceeedings of the annual meeting of the association for computational linguistics","DOI":"10.3115\/1073083.1073135"},{"key":"24_CR107","doi-asserted-by":"crossref","unstructured":"Patterson RD, Robinson K, Holdsworth J, McKeown D, Zhang C, Allerhand M (1992) Complex sounds and auditory images. In: Proceedings of international symposium on hearing, pp 429\u2013446","DOI":"10.1016\/B978-0-08-041847-6.50054-X"},{"key":"24_CR108","doi-asserted-by":"crossref","unstructured":"Perronnin F, Sanchez J, Mensink T (2010) Improving the fisher kernel for large-scale image classification. In: Proceedings of European conference on computer vision","DOI":"10.1007\/978-3-642-15561-1_11"},{"key":"24_CR109","doi-asserted-by":"crossref","unstructured":"Philbin J, Chum O, Isard M, Sivic J, Zisserman A (2008) Lost in quantization: improving particular object retrieval in large scale image databases. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2008.4587635"},{"key":"24_CR110","volume-title":"Head-driven phrase structure grammar","author":"C Pollard","year":"1994","unstructured":"Pollard C, Sag I (1994) Head-driven phrase structure grammar. Chicago University Press, Chicago"},{"issue":"6","key":"24_CR111","doi-asserted-by":"crossref","first-page":"976","DOI":"10.1016\/j.imavis.2009.11.014","volume":"28","author":"R Poppe","year":"2010","unstructured":"Poppe R (2010) Survey on vision-based human action recognition. Image Vision Comput 28(6):976\u2013990","journal-title":"Image Vision Comput"},{"key":"24_CR112","doi-asserted-by":"crossref","unstructured":"Rapantzikos K, Avrithis Y, Kollias S (2009) Dense saliency- based spatiotemporal feature points for action recognition. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2009.5206525"},{"key":"24_CR113","doi-asserted-by":"crossref","unstructured":"Raptis M, Soatto S (2010) Tracklet descriptors for action modeling and video analysis. In: Proceedings of European conference on computer vision","DOI":"10.1007\/978-3-642-15549-9_42"},{"key":"24_CR114","doi-asserted-by":"crossref","unstructured":"Rodriguez MD, Ahmed J, Shah M (2008) Action mach: a spatio-temporal maximum average correlation height filter for action recognition. In: Procedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2008.4587727"},{"issue":"2","key":"24_CR115","doi-asserted-by":"crossref","first-page":"99","DOI":"10.1023\/A:1026543900054","volume":"40","author":"Y Rubner","year":"2000","unstructured":"Rubner Y, Tomasi C, Guibas LJ (2000) The earth mover\u2019s distance as a metric for image retrieval. Int J Comput Vision 40(2):99\u2013 121","journal-title":"Int J Comput Vision"},{"issue":"1\u20133","key":"24_CR116","doi-asserted-by":"crossref","first-page":"157","DOI":"10.1007\/s11263-007-0090-8","volume":"77","author":"B Russell","year":"2008","unstructured":"Russell B, Torralba A, Murphy K, Freeman WT (2008) LabelMe: a database and web-based tool for image annotation. Int J Comput Vision 77(1\u20133):157\u2013173","journal-title":"Int J Comput Vision"},{"key":"24_CR117","doi-asserted-by":"crossref","unstructured":"Ryoo MS, Aggarwal JK (2006) Recognition of composite human activities through context-free grammar based representation. In: Proceedings pf IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2006.242"},{"issue":"10","key":"24_CR118","doi-asserted-by":"crossref","first-page":"1225","DOI":"10.1109\/TCSVT.2005.854237","volume":"15","author":"DA Sadlier","year":"2005","unstructured":"Sadlier DA, O\u2019Connor NE (2005) Event detection in field sports video using audio-visual features and a support vector machine. IEEE Trans Circ Syst Video Technol 15(10):1225\u20131233","journal-title":"IEEE Trans Circ Syst Video Technol"},{"issue":"9","key":"24_CR119","doi-asserted-by":"crossref","first-page":"1582","DOI":"10.1109\/TPAMI.2009.154","volume":"32","author":"KEA Sande van de","year":"2010","unstructured":"van de Sande KEA, Gevers T, Snoek CGM (2010) Evaluating color descriptors for object and scene recognition. IEEE Trans Pattern Anal Mach Intell 32(9):1582\u20131596","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"24_CR120","doi-asserted-by":"crossref","unstructured":"Satkin S, Hebert M (2010) Modeling the temporal extent of actions. In: Proceedings of European conference on computer vision","DOI":"10.1007\/978-3-642-15549-9_39"},{"key":"24_CR121","doi-asserted-by":"crossref","unstructured":"Schuldt C, Laptev I, Caputo B (2004) Recognizing human actions: a local SVM approach. In: Proceedings of international conference on pattern recognition","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"24_CR122","doi-asserted-by":"crossref","unstructured":"Scovanner P, Ali S, Shah M (2007) A 3-dimensional SIFT descriptor and its application to action recognition. In: Proceedings of ACM international conference on multimedia","DOI":"10.1145\/1291233.1291311"},{"key":"24_CR123","doi-asserted-by":"crossref","unstructured":"Shechtman E, Irani M (2007) Matching local self-similarities across images and videos. In: Proceedings lo IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2007.383198"},{"key":"24_CR124","doi-asserted-by":"crossref","unstructured":"Shotton J, Johnson M, Cipolla R (2008) Semantic texton forests for image categorization and segmentation. In: Proceedings of IEEE conference on computer vision and pattern recognitio","DOI":"10.1109\/CVPR.2008.4587503"},{"key":"24_CR125","doi-asserted-by":"crossref","unstructured":"Si Z, Pei M, Yao B, Zhu SC (2011) Unsupervised learning of event and-or grammar and semantics from video. In: Proceedings IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2011.6126223"},{"key":"24_CR126","doi-asserted-by":"crossref","unstructured":"Silpa-Anan C, Hartley R (2008) Optimised KD-trees for fast image descriptor matching. In: IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2008.4587638"},{"key":"24_CR127","doi-asserted-by":"crossref","unstructured":"Sivic J, Zisserman A (2003) Video Google: a text retrieval approach to object matching in videos. In: Proceedings of IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2003.1238663"},{"key":"24_CR128","doi-asserted-by":"crossref","unstructured":"Smeaton AF, Over P, Kraaij W (2006) Evaluation campaigns and TRECVid. In: Proceedings of ACM international workshop on multimedia information retrieval","DOI":"10.1145\/1178677.1178722"},{"issue":"12","key":"24_CR129","doi-asserted-by":"crossref","first-page":"1349","DOI":"10.1109\/34.895972","volume":"22","author":"AWM Smeulders","year":"2000","unstructured":"Smeulders AWM, Worring M, Santini S, Gupta A, Jain R (2000) Content based image retrieval at the end of the early years. IEEE Trans Pattern Anal Mach Intell 22(12):1349\u20131380","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"4","key":"24_CR130","doi-asserted-by":"crossref","first-page":"215","DOI":"10.1561\/1500000014","volume":"2","author":"CGM Snoek","year":"2008","unstructured":"Snoek CGM, Worring M (2008) Concept-based video retrieval. Found Trends Inf Retr 2(4):215\u2013322","journal-title":"Found Trends Inf Retr"},{"key":"24_CR131","unstructured":"Starner TE (1995) Visual recognition of american sign language using hidden markov models. Ph.D. thesis"},{"key":"24_CR132","unstructured":"Sun J, Wu X, Yan S, Cheong LF, Chua TS, Li J (2009) Hierarchical spatio-temporal context modeling for action recognition. In: Proceedings of IEEE conference on computer vision and pattern recognition"},{"key":"24_CR133","unstructured":"Sun SW, Wang YCF, Hung YL, Chang CL, Chen KC, Cheng SS, Wang HM, Liao HYM (2011) Automatic annotation of web videos. In: Proceedings of IEEE international conference on multimedia and expo"},{"key":"24_CR134","doi-asserted-by":"crossref","unstructured":"Tan CC, Jiang YG, Ngo CW (2011) Towards textually describing complex video contents with audio-visual concept classifiers. In: Proceedings of ACM international conference on multimedia","DOI":"10.1145\/2072298.2072411"},{"key":"24_CR135","doi-asserted-by":"crossref","unstructured":"Taylor G, Fergus R, LeCun Y, Bregler C (2010) Convolutional learning of spatio-temporal features. In: Proceedings of European conference on computer vision","DOI":"10.1007\/978-3-642-15567-3_11"},{"key":"24_CR136","doi-asserted-by":"crossref","unstructured":"Torresani L, Szummer M, Fitzgibbon A (2010) Efficient object category recognition using classemes. In: Proceedings of European conference on computer vision","DOI":"10.1007\/978-3-642-15549-9_56"},{"key":"24_CR137","doi-asserted-by":"crossref","unstructured":"Tran SD, Davis LS (2008) Event modeling and recognition using markov logic networks. In: Proceedings of European conference on computer vision","DOI":"10.1007\/978-3-540-88688-4_45"},{"issue":"4","key":"24_CR138","doi-asserted-by":"crossref","first-page":"522","DOI":"10.1109\/76.915358","volume":"11","author":"S Tsekeridou","year":"2001","unstructured":"Tsekeridou S, Pitas I (2001) Content-based video parsing and indexing based on audio-visual interaction. IEEE Transactions on Circuits and Systems for Video Technology 11(4):522\u2013535","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"issue":"11","key":"24_CR139","doi-asserted-by":"crossref","first-page":"1473","DOI":"10.1109\/TCSVT.2008.2005594","volume":"18","author":"P Turaga","year":"2008","unstructured":"Turaga P, Chellappa R, Subrahmanian VS, Udrea O (2008) Machine recognition of human activities: a survey. IEEE Trans Circ Syst Video Technol 18(11):1473\u20131488","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"24_CR140","doi-asserted-by":"crossref","unstructured":"Tuytelaars T (2010) Dense interest points. In: Proceedings of IEEE conference on computer vision and pattern recognition, pp 2281\u20132288","DOI":"10.1109\/CVPR.2010.5539911"},{"key":"24_CR141","doi-asserted-by":"crossref","unstructured":"Uemura H, Ishikawa S, Mikolajczyk K (2008) Feature tracking and motion compensation for action recognition. In: Proceedings British machine vision conference","DOI":"10.5244\/C.22.30"},{"key":"24_CR142","doi-asserted-by":"crossref","unstructured":"Uijlings JRR, Smeulders AWM, Scha RJH (2010) Real-time visual concept classification. IEEE Trans Multimedia 12(7): 665\u2013680","DOI":"10.1109\/TMM.2010.2052027"},{"key":"24_CR143","unstructured":"University of Central Florida 50 human action dataset (2010). http:\/\/server.cs.ucf.edu\/~ision\/data\/UCF50.rar"},{"key":"24_CR144","doi-asserted-by":"crossref","unstructured":"Vail DL, Veloso MM, Lafferty JD (2007) Conditional random fields for activity recognition. In: Proceedings of international joint conference on autonomous agents and multiagent systems","DOI":"10.1145\/1329125.1329409"},{"key":"24_CR145","doi-asserted-by":"crossref","unstructured":"Vedaldi A, Gulshan V, Varma M, Zisserman A (2009) Multiple kernels for object detection. In: Proceedings of IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2009.5459183"},{"key":"24_CR146","doi-asserted-by":"crossref","unstructured":"Vincent P, Larochelle H, Bengio Y, Manzagol PA (2008) Extracting and composing robust features with denoising autoencoders. In: Procedings of international conference on machine learning","DOI":"10.1145\/1390156.1390294"},{"issue":"12","key":"24_CR147","first-page":"3371","volume":"11","author":"P Vincent","year":"2010","unstructured":"Vincent P, Larochelle H, Lajoie I, Bengio Y, Manzagol PA (2010) Stacked denoising autoencoders: learning useful representations in a deep network with a local denoising criterion. J Mach Learn Res 11(12):3371\u20133408","journal-title":"J Mach Learn Res"},{"key":"24_CR148","doi-asserted-by":"crossref","unstructured":"Viola P, Jones M (2001) Rapid object detection using a boosted cascade of simple features. In; Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2001.990517"},{"key":"24_CR149","doi-asserted-by":"crossref","unstructured":"Wang F, Jiang YG, Ngo CW (2008) Video event detection using motion relativity and visual relatedness. In: Proceedings of ACM international conference on multimedia","DOI":"10.1145\/1459359.1459392"},{"key":"24_CR150","doi-asserted-by":"crossref","unstructured":"Wang H, Klaser A, Schmid C, Liu CL (2011) Action recognition by dense trajectories. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"24_CR151","doi-asserted-by":"crossref","unstructured":"Wang H, Ullah MM, Klaser A, Laptev I, Schmid C (2008) Evaluation of local spatio-temporal features for action recognition. In: Proceedings of British machine vision conference","DOI":"10.5244\/C.23.124"},{"key":"24_CR152","doi-asserted-by":"crossref","unstructured":"Wang J, Kumar S, Chang SF (2010) Semi-supervised hashing for scalable image retrieval. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2010.5539994"},{"key":"24_CR153","doi-asserted-by":"crossref","unstructured":"Wang L, Suter D (2007) Recognizing human activities from silhouettes: motion subspace and factorial discriminative graphical model. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2007.383298"},{"key":"24_CR154","doi-asserted-by":"crossref","unstructured":"Wang Y, Mori G (2009) Max-margin hidden conditional random fields for human action recognition. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2009.5206709"},{"issue":"2","key":"24_CR155","doi-asserted-by":"crossref","first-page":"249","DOI":"10.1016\/j.cviu.2006.07.013","volume":"104","author":"D Weinland","year":"2006","unstructured":"Weinland D, Ronfard R, Boyer E (2006) Free viewpoint action recognition using motion history volumes. Comput Vision Image Underst 104(2):249\u2013257","journal-title":"Comput Vision Image Underst"},{"key":"24_CR156","unstructured":"Weiss Y, Torralba A, Fergus R (2008) Spectral hashing. In: Proceedings of advances in neural information processing systems"},{"key":"24_CR157","doi-asserted-by":"crossref","unstructured":"White B, Yeh T, Lin J, Davis L (2009) Web-scale computer vision using mapreduce for multimedia data mining. In: Proceedings of ACM SIGKDD workshop on multimedia data mining","DOI":"10.1145\/1814245.1814254"},{"key":"24_CR158","doi-asserted-by":"crossref","unstructured":"Willems G, Tuytelaars T, van Gool L (2008) An efficient dense and scale-invariant spatio-temporal interest point detector. In: Proceedings European conference on computer vision","DOI":"10.1007\/978-3-540-88688-4_48"},{"key":"24_CR159","doi-asserted-by":"crossref","unstructured":"Wu S, Oreifej O, Shah M (2011) Action recognition in videos acquired by a moving camera using motion decomposition of lagrangian particle trajectories. In: Proceedings of IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2011.6126397"},{"issue":"7","key":"24_CR160","doi-asserted-by":"crossref","first-page":"767","DOI":"10.1016\/j.patrec.2004.01.005","volume":"25","author":"L Xie","year":"2004","unstructured":"Xie L, Xu P, Chang SF, Divakaran A, Sun H (2004) Structure analysis of soccer video with domain knowledge and hidden markov models. Pattern Recognit Lett 25(7):767\u2013775","journal-title":"Pattern Recognit Lett"},{"issue":"3","key":"24_CR161","doi-asserted-by":"crossref","first-page":"421","DOI":"10.1109\/TMM.2008.917346","volume":"10","author":"C Xu","year":"2008","unstructured":"Xu C, Wang J, Lu H, Zhang Y (2008) A novel framework for semantic annotation and personalized retrieval of sports video. IEEE Trans Multimedia 10(3):421\u2013436","journal-title":"IEEE Trans Multimedia"},{"issue":"11","key":"24_CR162","doi-asserted-by":"crossref","first-page":"1985","DOI":"10.1109\/TPAMI.2008.129","volume":"30","author":"D Xu","year":"2008","unstructured":"Xu D, Chang SF (2008) Video event recognition using Kernel methods with multilevel temporal alignment. IEEE Trans Pattern Anal Mach Intell 30(11):1985\u20131997","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"24_CR163","unstructured":"Xu M, Maddage NC, Xu C, Kankanhalli M, Tian Q (2003) Creating audio keywords for event detection in soccer video. In: Proceedings IEEE international conference on multimedia and expo"},{"key":"24_CR164","doi-asserted-by":"crossref","unstructured":"Yamato J, Ohya J, Ishii K (1992) Recognizing human action in time-sequential images using hidden markov model. In: Proceedings of IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.1992.223161"},{"key":"24_CR165","doi-asserted-by":"crossref","unstructured":"Yan R, Fleury MO, Merler M, Natsev A, Smith JR (2009) Large-scale multimedia semantic concept modeling using robust subspace bagging and mapreduce. In: Proceedings of ACM workshop on large-scale multimedia retrieval and mining","DOI":"10.1145\/1631058.1631067"},{"key":"24_CR166","volume-title":"Brief descriptions of visual features for baseline trecvid concept detectors","author":"A Yanagawa","year":"2006","unstructured":"Yanagawa A, Hsu W, Chang SF (2006) Brief descriptions of visual features for baseline trecvid concept detectors. Columbia University, Tech. rep."},{"issue":"8","key":"24_CR167","doi-asserted-by":"crossref","first-page":"1485","DOI":"10.1109\/JPROC.2010.2050411","volume":"98","author":"B Yao","year":"2010","unstructured":"Yao B, Yang X, Lin L, Lee M, Zhu S (2010) I2T: Image parsing to text description. Proc IEEE 98(8):1485\u20131508","journal-title":"Proc IEEE"},{"key":"24_CR168","doi-asserted-by":"crossref","unstructured":"Ye G, Jhuo IH, Liu D, Jiang YG, Chang SF (2012) Joint audio-visual bi-modal codewords for video event detection. In: Proceedings of ACM international conference on multimedia retrieval","DOI":"10.1145\/2324796.2324843"},{"key":"24_CR169","unstructured":"Ye G, Liu D, Jhuo IH, Chang SF (2012) Robust late fusion with rank minimization. In: Proceedings IEEE conference on computer vision and pattern recognition"},{"key":"24_CR170","doi-asserted-by":"crossref","unstructured":"Yu TH, Kim TK, Cipolla R (2010) Real-time action recognition by sptiotemoral semantic and structural forests. In: Proceedings of British machine vision conference","DOI":"10.5244\/C.24.52"},{"key":"24_CR171","unstructured":"Yuan F, Prinet V, Yuan J (2010) Middle-level representation for human activities recognition: the role of spatio-temporal relationships. In: Proceedings of ECCV Workshop on human motion: understanding, modeling, capture and animation"},{"key":"24_CR172","doi-asserted-by":"crossref","unstructured":"Yuen J, Russell BC, Liu C, Torralba A (2009) LabelMe video: building a video database with human annotations. In: Proceedings of international conference on computer vision","DOI":"10.1109\/ICCV.2009.5459289"},{"key":"24_CR173","doi-asserted-by":"crossref","unstructured":"Zhang D, Chang SF (2002) Event detection in baseball video using superimposed caption recognition. In: Proceedings of ACM international conference on multimedia","DOI":"10.1145\/641007.641073"},{"issue":"2","key":"24_CR174","doi-asserted-by":"crossref","first-page":"213","DOI":"10.1007\/s11263-006-9794-4","volume":"73","author":"J Zhang","year":"2007","unstructured":"Zhang J, Marszalek M, Lazebnik S, Schmid C (2007) Local features and kernels for classification of texture and object categories: a comprehensive study. Int J Comput Vision 73(2):213\u2013238","journal-title":"Int J Comput Vision"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-012-0024-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s13735-012-0024-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-012-0024-2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T19:25:17Z","timestamp":1687807517000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s13735-012-0024-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,11,13]]},"references-count":174,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2013,6]]}},"alternative-id":["24"],"URL":"https:\/\/doi.org\/10.1007\/s13735-012-0024-2","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012,11,13]]}}}