{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,3]],"date-time":"2025-06-03T19:40:02Z","timestamp":1748979602480,"version":"3.41.0"},"reference-count":87,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2016,5,21]],"date-time":"2016-05-21T00:00:00Z","timestamp":1463788800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"DOI":"10.13039\/100000001","name":"US NSF","doi-asserted-by":"crossref","award":["1522954-IIS"],"award-info":[{"award-number":["1522954-IIS"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"crossref"}]},{"name":"DARPA\/ARL","award":["W911NF-10-2-0060"],"award-info":[{"award-number":["W911NF-10-2-0060"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2016,10]]},"DOI":"10.1007\/s00138-016-0768-4","type":"journal-article","created":{"date-parts":[[2016,5,24]],"date-time":"2016-05-24T02:52:36Z","timestamp":1464058356000},"page":"983-995","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Collecting and annotating the large continuous action dataset"],"prefix":"10.1007","volume":"27","author":[{"given":"Daniel Paul","family":"Barrett","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ran","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haonan","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jeffrey Mark","family":"Siskind","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2016,5,21]]},"reference":[{"key":"768_CR1","unstructured":"Barbu, A., Bridge, A., Burchill, Z., Coroian, D., Dickinson, S., Fidler, S., Michaux, A., Mussman, S., Siddharth, N., Salvi, D., Schmidt, L., Shangguan, J., Siskind, J.M., Waggoner, J., Wang, S., Wei, J., Yin, Y., Zhang, Z.: Video in sentences out. In: Uncertainty in Artificial Intelligence, pp. 102\u2013112 (2012)"},{"key":"768_CR2","first-page":"203","volume":"2","author":"A Barbu","year":"2012","unstructured":"Barbu, A., Siddharth, N., Michaux, A., Siskind, J.M.: Simultaneous object detection, tracking, and event recognition. Adv. Cogn. Syst. 2, 203\u2013220 (2012)","journal-title":"Adv. Cogn. Syst."},{"key":"768_CR3","doi-asserted-by":"publisher","unstructured":"Barrett, D.P., Siskind, J.M.: Action recognition by time-series of retinotopic appearance and motion features. IEEE Trans. Circuits Syst. Video Technol. (2015). doi: 10.1109\/TCSVT.2015.2502839","DOI":"10.1109\/TCSVT.2015.2502839"},{"key":"768_CR4","doi-asserted-by":"crossref","unstructured":"Blank, M., Gorelick, L., Shechtman, E., Irani, M., Basri, R.: Actions as space-time shapes. In: International Conference on Computer Vision vol. 2, pp. 1395\u20131402 (2005)","DOI":"10.1109\/ICCV.2005.28"},{"key":"768_CR5","doi-asserted-by":"crossref","unstructured":"Brand, M., Oliver, N., Pentland, A.: Coupled hidden Markov models for complex action recognition. In: Computer Vision and Pattern Recognition, pp. 994\u2013999 (1997)","DOI":"10.1109\/CVPR.1997.609450"},{"key":"768_CR6","doi-asserted-by":"crossref","unstructured":"Cao, Y., Barrett, D., Barbu, A., Narayanaswamy, S., Yu, H., Michaux, A., Lin, Y., Dickinson, S., Siskind, J.M., Wang, S.: Recognizing human activities from partially observed videos. In: Computer Vision and Pattern Recognition, pp. 2658\u20132665 (2013)","DOI":"10.1109\/CVPR.2013.343"},{"key":"768_CR7","doi-asserted-by":"crossref","unstructured":"Das, P., Xu, C., Doell, R.F., Corso, J.J.: A thousand frames in just a few words: Lingual description of videos through latent topics and sparse object stitching. In: Computer Vision and Pattern Recognition, pp. 2634\u20132641 (2013)","DOI":"10.1109\/CVPR.2013.340"},{"issue":"1\u20132","key":"768_CR8","doi-asserted-by":"crossref","first-page":"31","DOI":"10.1016\/j.artint.2005.06.007","volume":"167","author":"PF Dominey","year":"2005","unstructured":"Dominey, P.F., Boucher, J.D.: Learning to talk about events from narrated video in a construction grammar framework. Artif. Intell. 167(1\u20132), 31\u201361 (2005)","journal-title":"Artif. Intell."},{"key":"768_CR9","doi-asserted-by":"crossref","unstructured":"Efros, A., Berg, A., Mori, G., Malik, J.: Recognizing action at a distance. In: International Conference on Computer Vision, pp. 726\u2013733 (2003)","DOI":"10.1109\/ICCV.2003.1238420"},{"issue":"2","key":"768_CR10","doi-asserted-by":"crossref","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The PASCAL visual object classes (VOC) challenge. Int. J. Comput. Vis. 88(2), 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vis."},{"key":"768_CR11","doi-asserted-by":"crossref","unstructured":"Everts, I., van Gemert, J.C., Gevers, T.: Evaluation of color STIPs for human action recognition. In: Computer Vision and Pattern Recognition, pp. 2850\u20132857 (2013)","DOI":"10.1109\/CVPR.2013.367"},{"key":"768_CR12","doi-asserted-by":"crossref","unstructured":"Farneb\u00e4ck, G.: Two-frame motion estimation based on polynomial expansion. In: Image analysis, pp. 363\u2013370. Springer, Berlin (2003)","DOI":"10.1007\/3-540-45103-X_50"},{"key":"768_CR13","doi-asserted-by":"crossref","unstructured":"Fern\u00e1ndez\u00a0Tena, C., Baiget, P., Roca, X., Gonz\u00e0lez, J.: Natural language descriptions of human behavior from video sequences. In: Advances in Artificial Intelligence, pp. 279\u2013292 (2007)","DOI":"10.1007\/978-3-540-74565-5_22"},{"issue":"3","key":"768_CR14","doi-asserted-by":"crossref","first-page":"219","DOI":"10.1007\/s11263-013-0677-1","volume":"107","author":"A Gaidon","year":"2014","unstructured":"Gaidon, A., Harchaoui, Z., Schmid, C.: Activity representation with motion hierarchies. Int. J. Comput. Vis. 107(3), 219\u2013238 (2014)","journal-title":"Int. J. Comput. Vis."},{"key":"768_CR15","doi-asserted-by":"crossref","unstructured":"Gopalan, R.: Joint sparsity-based representation and analysis of unconstrained activities. In: Computer Vision and Pattern Recognition, pp. 2738\u20132745 (2013)","DOI":"10.1109\/CVPR.2013.353"},{"key":"768_CR16","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., Mooney, R., Darrell, T., Saenko, K.: Youtube2text: recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: International Conference on Computer Vision, pp. 2712\u20132719 (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"768_CR17","doi-asserted-by":"crossref","unstructured":"Gupta, A., Davis, L.S.: Objects in action: an approach for combining action understanding and object perception. In: Computer Vision and Pattern Recognition, pp. 1\u20138 (2007)","DOI":"10.1109\/CVPR.2007.383331"},{"key":"768_CR18","doi-asserted-by":"crossref","unstructured":"Hanckmann, P., Schutte, K., Burghouts, G.J.: Automated textual descriptions for a wide range of video events with 48 human actions. In: European Conference on Computer Vision Workshops, pp. 372\u2013380 (2012)","DOI":"10.1007\/978-3-642-33863-2_37"},{"issue":"8","key":"768_CR19","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"768_CR20","doi-asserted-by":"crossref","unstructured":"Ikizler-Cinbis, N., Sclaroff, S.: Object, scene and actions: Combining multiple features for human action recognition. In: European Conference on Computer Vision, pp. 494\u2013507 (2010)","DOI":"10.1007\/978-3-642-15549-9_36"},{"key":"768_CR21","doi-asserted-by":"crossref","unstructured":"Izadinia, H., Sadeghi, F., Divvala, S.K., Hajishirzi, H., Choi, Y., Farhadi, A.: Segment-phrase table for semantic segmentation, visual entailment and paraphrasing. In: International Conference on Computer Vision, pp. 10\u201318 (2015)","DOI":"10.1109\/ICCV.2015.10"},{"key":"768_CR22","doi-asserted-by":"crossref","unstructured":"Jain, A., Gupta, A., Rodriguez, M., Davis, L.S.: Representing videos using mid-level discriminative patches. In: Computer Vision and Pattern Recognition, pp. 2571\u20132578 (2013)","DOI":"10.1109\/CVPR.2013.332"},{"key":"768_CR23","doi-asserted-by":"crossref","unstructured":"J\u00e9gou, H., Douze, M., Schmid, C., P\u00e9rez, P.: Aggregating local descriptors into a compact image representation. In: Computer Vision and Pattern Recognition, pp. 3304\u20133311 (2010)","DOI":"10.1109\/CVPR.2010.5540039"},{"key":"768_CR24","doi-asserted-by":"crossref","unstructured":"Jhuang, H., Serre, T., Wolf, L., Poggio, T.: A biologically inspired system for action recognition. In: International Conference on Computer Vision, pp. 1\u20138 (2007)","DOI":"10.1109\/ICCV.2007.4408988"},{"key":"768_CR25","unstructured":"Jiang, Y.G., Liu, J., Roshan\u00a0Zamir, A., Toderici, G., Laptev, I., Shah, M., Sukthankar, R.: THUMOS challenge: action recognition with a large number of classes. http:\/\/crcv.ucf.edu\/THUMOS14\/ (2014)"},{"key":"768_CR26","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: Computer Vision and Pattern Recognition, pp. 1725\u20131732 (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"768_CR27","doi-asserted-by":"crossref","unstructured":"Ke, Y., Sukthankar, R., Hebert, M.: Event detection in crowded videos. In: International Conference on Computer Vision, pp. 1\u20138 (2007)","DOI":"10.1109\/ICCV.2007.4409011"},{"key":"768_CR28","unstructured":"Khan, M.U.G., Gotoh, Y.: Describing video contents in natural language. In: Workshop on Innovative Hybrid Approaches to the Processing of Textual Data, pp. 27\u201335 (2012)"},{"key":"768_CR29","doi-asserted-by":"crossref","unstructured":"Khan, M.U.G., Zhang, L., Gotoh, Y.: Human focused video description. In: International Conference on Computer Vision Workshops, pp. 1480\u20131487 (2011)","DOI":"10.1109\/ICCVW.2011.6130425"},{"key":"768_CR30","doi-asserted-by":"crossref","unstructured":"Khan, M.U.G., Zhang, L., Gotoh, Y.: Towards coherent natural language description of video streams. In: International Conference on Computer Vision Workshops, pp. 664\u2013671 (2011)","DOI":"10.1109\/ICCVW.2011.6130306"},{"issue":"2","key":"768_CR31","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima, A., Tamura, T., Fukunaga, K.: Natural language description of human activities from video images based on concept hierarchy of actions. Int. J. Comput. Vis. 50(2), 171\u2013184 (2002)","journal-title":"Int. J. Comput. Vis."},{"key":"768_CR32","doi-asserted-by":"crossref","unstructured":"Krishnamoorthy, N., Malkarnenkar, G., Mooney, R.J., Saenko, K., Guadarrama, S.: Generating natural-language video descriptions using text-mined knowledge. In: Conference on Artificial Intelligence, pp. 541\u2013547 (2013)","DOI":"10.1609\/aaai.v27i1.8679"},{"key":"768_CR33","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: International Conference on Computer Vision, pp. 2556\u20132563 (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"issue":"2\u20133","key":"768_CR34","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1007\/s11263-005-1838-7","volume":"64","author":"I Laptev","year":"2005","unstructured":"Laptev, I.: On space\u2013time interest points. Int. J. Comput. Vis. 64(2\u20133), 107\u2013123 (2005)","journal-title":"Int. J. Comput. Vis."},{"key":"768_CR35","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marsza\u0142ek, M., Schmid, C., Rozenfeld, B.: Learning realistic human actions from movies. In: Computer Vision and Pattern Recognition, pp. 1\u20138 (2008)","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"768_CR36","doi-asserted-by":"crossref","unstructured":"Le, Q.V., Zou, W.Y., Yeung, S.Y., Ng, A.Y.: Learning hierarchical invariant spatio-temporal features for action recognition with independent subspace analysis. In: Computer Vision and Pattern Recognition, pp. 3361\u20133368 (2011)","DOI":"10.1109\/CVPR.2011.5995496"},{"key":"768_CR37","doi-asserted-by":"crossref","unstructured":"Li, L.J., Fei-Fei, L.: What, where and who? Classifying events by scene and object recognition. In: International Conference on Computer Vision, pp. 1\u20138 (2007)","DOI":"10.1109\/ICCV.2007.4408872"},{"key":"768_CR38","doi-asserted-by":"crossref","unstructured":"Lin, Z., Jiang, Z., Davis, L.: Recognizing actions by shape-motion prototype trees. In: International Conference on Computer Vision, pp. 444\u2013451 (2009)","DOI":"10.1109\/ICCV.2009.5459184"},{"key":"768_CR39","doi-asserted-by":"crossref","first-page":"411","DOI":"10.1007\/978-0-85729-997-0_20","volume-title":"Visual Analysis of Humans: Looking at People, Chapter\u00a020","author":"H Liu","year":"2011","unstructured":"Liu, H., Feris, R., Sun, M.T.: Benchmarking datasets for human activity recognition. In: Moeslund, T.B., Hilton, A., Kr\u00fcger, V., Sigal, L. (eds.) Visual Analysis of Humans: Looking at People, Chapter\u00a020, pp. 411\u2013427. Springer, Berlin (2011)"},{"key":"768_CR40","doi-asserted-by":"crossref","unstructured":"Liu, J., Kuipers, B., Savarese, S.: Recognizing human actions by attributes. In: Computer Vision and Pattern Recognition, pp. 3337\u20133344 (2011)","DOI":"10.1109\/CVPR.2011.5995353"},{"key":"768_CR41","doi-asserted-by":"crossref","unstructured":"Liu, J., Luo, J., Shah, M.: Recognizing realistic actions from videos \u201cin the wild\u201d. In: Computer Vision and Pattern Recognition, pp. 1996\u20132003 (2009)","DOI":"10.1109\/CVPR.2009.5206744"},{"key":"768_CR42","doi-asserted-by":"crossref","unstructured":"Maji, S., Bourdev, L., Malik, J.: Action recognition from a distributed representation of pose and appearance. In: Computer Vision and Pattern Recognition, pp. 3177\u20133184 (2011)","DOI":"10.1109\/CVPR.2011.5995631"},{"key":"768_CR43","doi-asserted-by":"crossref","unstructured":"Marsza\u0142ek, M., Laptev, I., Schmid, C.: Actions in context. In: Computer Vision and Pattern Recognition, pp. 2929\u20132936 (2009)","DOI":"10.1109\/CVPR.2009.5206557"},{"key":"768_CR44","doi-asserted-by":"crossref","unstructured":"Messing, R., Pal, C., Kautz, H.: Activity recognition using the velocity histories of tracked keypoints. In: International Conference on Computer Vision, pp. 104\u2013111 (2009)","DOI":"10.1109\/ICCV.2009.5459154"},{"key":"768_CR45","doi-asserted-by":"crossref","unstructured":"Moore, D.J., Essa, I.A., Heyes, M.H.: Exploiting human actions and object context for recognition tasks. In: International Conference on Computer Vision, pp. 80\u201386 (1999)","DOI":"10.1109\/ICCV.1999.791201"},{"key":"768_CR46","doi-asserted-by":"crossref","unstructured":"Ng, J.Y.H., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R., Toderici, G.: Beyond short snippets: Deep networks for video classification. In: Computer Vision and Pattern Recognition, pp. 4694\u20134702 (2015)","DOI":"10.1109\/CVPR.2015.7299101"},{"key":"768_CR47","doi-asserted-by":"crossref","unstructured":"Niebles, J.C., Chen, C.W., Fei-Fei, L.: Modeling temporal structure of decomposable motion segments for activity classification. In: European Conference on Computer Vision, pp. 392\u2013405 (2010)","DOI":"10.1007\/978-3-642-15552-9_29"},{"key":"768_CR48","doi-asserted-by":"crossref","unstructured":"Oh, S., Hoogs, A., Perera, A., Cuntoor, N., Chen, C.C., Lee, J.T., Mukherjee, S., Aggarwal, J.K., Lee, H., Davis, L., Swears, E., Wang, X., Ji, Q., Reddy, K., Shah, M., Vondrick, C., Pirsiavash, H., Ramanan, D., Yuen, J., Torralba, A., Song, B., Fong, A., Roy-Chowdhury, A., Desai, M.: A large-scale benchmark dataset for event recognition in surveillance video. In: Computer Vision and Pattern Recognition, pp. 3153\u20133160 (2011)","DOI":"10.1109\/CVPR.2011.5995586"},{"key":"768_CR49","doi-asserted-by":"crossref","unstructured":"Oneata, D., Verbeek, J., Schmid, C.: Action and event recognition with Fisher vectors on a compact feature set. In: International Conference on Computer Vision, pp. 1817\u20131824 (2013)","DOI":"10.1109\/ICCV.2013.228"},{"key":"768_CR50","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-319-04184-1","volume-title":"Robust Subspace Estimation Using Low-Rank Optimization, Chapter\u00a05","author":"O Oreifej","year":"2014","unstructured":"Oreifej, O., Shah, M.: Robust Subspace Estimation Using Low-Rank Optimization, Chapter\u00a05. Springer, Berlin (2014)"},{"key":"768_CR51","doi-asserted-by":"crossref","unstructured":"Perronnin, F., S\u00e1nchez, J., Mensink, T.: Improving the fisher kernel for large-scale image classification. In: European Conference on Computer Vision, pp. 143\u2013156 (2010)","DOI":"10.1007\/978-3-642-15561-1_11"},{"issue":"5","key":"768_CR52","doi-asserted-by":"crossref","first-page":"971","DOI":"10.1007\/s00138-012-0450-4","volume":"24","author":"KK Reddy","year":"2013","unstructured":"Reddy, K.K., Shah, M.: Recognizing 50 human action categories of web videos. Mach. Vis. Appl. 24(5), 971\u2013981 (2013)","journal-title":"Mach. Vis. Appl."},{"key":"768_CR53","doi-asserted-by":"crossref","unstructured":"Rodriguez, M.D., Ahmed, J., Shah, M.: Action MACH: A spatio-temporal maximum average correlation height filter for action recognition. In: Computer Vision and Pattern Recognition, pp. 1\u20138 (2008)","DOI":"10.1109\/CVPR.2008.4587727"},{"key":"768_CR54","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Qiu, W., Titov, I., Thater, S., Pinkal, M., Schiele, B.: Translating video content to natural language descriptions. In: International Conference on Computer Vision, pp. 433\u2013440 (2013)","DOI":"10.1109\/ICCV.2013.61"},{"issue":"3","key":"768_CR55","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A.C., Fei-Fei, L.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 1\u201342 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"768_CR56","doi-asserted-by":"crossref","unstructured":"Ryoo, M.S.: Human activity prediction: early recognition of ongoing activities from streaming videos. In: International Conference on Computer Vision, pp. 1036\u20131043 (2011)","DOI":"10.1109\/ICCV.2011.6126349"},{"key":"768_CR57","doi-asserted-by":"crossref","unstructured":"Sadanand, S., Corso, J.J.: Action bank: a high-level representation of activity in video. In: Computer Vision and Pattern Recognition, pp. 1234\u20131241 (2012)","DOI":"10.1109\/CVPR.2012.6247806"},{"key":"768_CR58","doi-asserted-by":"crossref","unstructured":"Schuldt, C., Laptev, I., Caputo, B.: Recognizing human actions: a local SVM approach. In: International Conference on Pattern Recognition, pp. 32\u201336 (2004)","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"768_CR59","doi-asserted-by":"crossref","unstructured":"Siddharth, N., Barbu, A., Siskind, J.M.: Seeing what you\u2019re told: sentence-guided activity recognition in video. In: Computer Vision and Pattern Recognition, pp. 732\u2013739 (2014)","DOI":"10.1109\/CVPR.2014.99"},{"key":"768_CR60","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: International Conference on Learning Representations (2015)"},{"key":"768_CR61","unstructured":"Siskind, J.M.: Visual event classification via force dynamics. In: Conference on Artificial Intelligence, pp. 149\u2013155 (2000)"},{"key":"768_CR62","doi-asserted-by":"crossref","unstructured":"Siskind, J.M., Morris, Q.: A maximum-likelihood approach to visual event classification. In: European Conference on Computer Vision, pp. 347\u2013360 (1996)","DOI":"10.1007\/3-540-61123-1_152"},{"key":"768_CR63","doi-asserted-by":"crossref","unstructured":"Smeaton, A., Over, P., Kraaij, W.: Evaluation campaigns and TRECVID. In: ACM International Conference on Multimedia Information Retrieval, pp. 321\u2013330 (2006)","DOI":"10.1145\/1178677.1178722"},{"key":"768_CR64","doi-asserted-by":"crossref","unstructured":"Song, Y., Morency, L.P., Davis, R.: Action recognition by hierarchical sequence summarization. In: Computer Vision and Pattern Recognition, pp. 3562\u20133569 (2013)","DOI":"10.1109\/CVPR.2013.457"},{"key":"768_CR65","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: A dataset of 101 human actions classes from videos in the wild. Tech. Rep. arXiv:1212.0402 (2012)"},{"key":"768_CR66","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15, 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"768_CR67","doi-asserted-by":"crossref","unstructured":"Tang, K., Fei-Fei, L., Koller, D.: Learning latent temporal structure for complex event detection. In: Computer Vision and Pattern Recognition, pp. 1250\u20131257 (2012)","DOI":"10.1109\/CVPR.2012.6247808"},{"key":"768_CR68","doi-asserted-by":"crossref","unstructured":"Tian, Y., Sukthankar, R., Shah, M.: Spatiotemporal deformable part models for action detection. In: Computer Vision and Pattern Recognition, pp. 2642\u20132649 (2013)","DOI":"10.1109\/CVPR.2013.341"},{"key":"768_CR69","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: International Conference on Computer Vision, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"768_CR70","doi-asserted-by":"crossref","unstructured":"Uemura, H., Ishikawa, S., Mikolajczyk, K.: Feature tracking and motion compensation for action recognition. In: British Machine Vision Conference, pp. 1\u201310 (2008)","DOI":"10.5244\/C.22.30"},{"key":"768_CR71","doi-asserted-by":"crossref","unstructured":"Wang, C., Wang, Y., Yuille, A.L.: An approach to pose-based action recognition. In: Computer Vision and Pattern Recognition, pp. 915\u2013922 (2013)","DOI":"10.1109\/CVPR.2013.123"},{"key":"768_CR72","doi-asserted-by":"crossref","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C., Liu, C.L.: Action recognition by dense trajectories. In: Computer Vision and Pattern Recognition, pp. 3169\u20133176 (2011)","DOI":"10.1109\/CVPR.2011.5995407"},{"issue":"1","key":"768_CR73","doi-asserted-by":"crossref","first-page":"60","DOI":"10.1007\/s11263-012-0594-8","volume":"103","author":"H Wang","year":"2013","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C., Liu, C.L.: Dense trajectories and motion boundary descriptors for action recognition. Int. J. Comput. Vis. 103(1), 60\u201379 (2013)","journal-title":"Int. J. Comput. Vis."},{"key":"768_CR74","doi-asserted-by":"crossref","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: International Conference on Computer Vision, pp. 3551\u20133558 (2013)","DOI":"10.1109\/ICCV.2013.441"},{"issue":"10","key":"768_CR75","doi-asserted-by":"crossref","first-page":"1762","DOI":"10.1109\/TPAMI.2009.43","volume":"31","author":"Y Wang","year":"2009","unstructured":"Wang, Y., Mori, G.: Human action recognition by semilatent topic models. IEEE Trans. Pattern Anal. Mach. Intell. 31(10), 1762\u20131674 (2009)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"768_CR76","doi-asserted-by":"crossref","unstructured":"Wang, Z., Kuruoglu, E.E., Yang, X., Xu, Y., Yu, S.: Event recognition with time varying hidden Markov model. In: International Conference on Accoustic and Speech Signal Processing, pp. 1761\u20131764 (2009)","DOI":"10.1109\/ICASSP.2009.4959945"},{"key":"768_CR77","doi-asserted-by":"crossref","unstructured":"Willems, G., Tuytelaars, T., Van\u00a0Gool, L.: An efficient dense and scale-invariant spatio-temporal interest point detector. In: European Conference on Computer Vision, pp. 650\u2013663 (2008)","DOI":"10.1007\/978-3-540-88688-4_48"},{"key":"768_CR78","doi-asserted-by":"crossref","unstructured":"Wu, X., Xu, D., Duan, L., Luo, J.: Action recognition using context and appearance distribution features. In: Computer Vision and Pattern Recognition, pp. 489\u2013496 (2011)","DOI":"10.1109\/CVPR.2011.5995624"},{"key":"768_CR79","unstructured":"Xu, G., Ma, Y.F., Zhang, H., Yang, S.: Motion based event recognition using HMM. In: International Conference on Pattern Recognition, pp. 831\u2013834 (2002)"},{"key":"768_CR80","doi-asserted-by":"crossref","unstructured":"Xu, Z., Yang, Y., Hauptmann, A.G.: A discriminative CNN video representation for event detection. In: Computer Vision and Pattern Recognition, pp. 1798\u20131807 (2015)","DOI":"10.1109\/CVPR.2015.7298789"},{"key":"768_CR81","doi-asserted-by":"crossref","unstructured":"Yamoto, J., Ohya, J., Ishii, K.: Recognizing human action in time-sequential images using hidden Markov model. In: Computer Vision and Pattern Recognition, pp. 379\u2013385 (1992)","DOI":"10.1109\/CVPR.1992.223161"},{"key":"768_CR82","doi-asserted-by":"crossref","first-page":"601","DOI":"10.1613\/jair.4556","volume":"52","author":"H Yu","year":"2015","unstructured":"Yu, H., Siddharth, N., Barbu, A., Siskind, J.M.: A compositional framework for grounding language inference, generation, and acquisition in video. J. Artif. Intell. Res. 52, 601\u2013713 (2015)","journal-title":"J. Artif. Intell. Res."},{"key":"768_CR83","unstructured":"Yu, H., Siskind, J.M.: Grounded language learning from video described with sentences. In: Annual Meeting of the Association for Computational Linguistics, pp. 53\u201363 (2013)"},{"key":"768_CR84","doi-asserted-by":"crossref","unstructured":"Yuan, C., Hu, W., Tian, G., Yang, S., Wang, H.: Multi-task sparse learning with Beta process prior for action recognition. In: Computer Vision and Pattern Recognition, pp. 423\u2013429 (2013)","DOI":"10.1109\/CVPR.2013.61"},{"key":"768_CR85","doi-asserted-by":"crossref","unstructured":"Yuan, C., Li, X., Hu, W., Ling, H., Maybank, S.: 3D R transform on spatio-temporal interest points for action recognition. In: Computer Vision and Pattern Recognition, pp. 724\u2013730 (2013)","DOI":"10.1109\/CVPR.2013.99"},{"key":"768_CR86","doi-asserted-by":"crossref","unstructured":"Yuan, J., Liu, Z., Wu, Y.: Discriminative subvolume search for efficient action detection. In: Computer Vision and Pattern Recognition, pp. 2442\u20132449 (2009)","DOI":"10.1109\/CVPR.2009.5206671"},{"key":"768_CR87","doi-asserted-by":"crossref","unstructured":"Zhu, J., Wang, B., Yang, X., Zhang, W., Tu, Z.: Action recognition with Actons. In: International Conference on Computer Vision, pp. 3559\u20133566 (2013)","DOI":"10.1109\/ICCV.2013.442"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-016-0768-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00138-016-0768-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-016-0768-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-016-0768-4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,3]],"date-time":"2025-06-03T19:04:55Z","timestamp":1748977495000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00138-016-0768-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,5,21]]},"references-count":87,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2016,10]]}},"alternative-id":["768"],"URL":"https:\/\/doi.org\/10.1007\/s00138-016-0768-4","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"type":"print","value":"0932-8092"},{"type":"electronic","value":"1432-1769"}],"subject":[],"published":{"date-parts":[[2016,5,21]]}}}