{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T02:50:57Z","timestamp":1771037457379,"version":"3.50.1"},"reference-count":96,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2018,8,19]],"date-time":"2018-08-19T00:00:00Z","timestamp":1534636800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2019,4]]},"DOI":"10.1007\/s11263-018-1111-5","type":"journal-article","created":{"date-parts":[[2018,8,19]],"date-time":"2018-08-19T02:38:49Z","timestamp":1534646329000},"page":"340-362","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":29,"title":["Second-order Temporal Pooling for Action Recognition"],"prefix":"10.1007","volume":"127","author":[{"given":"Anoop","family":"Cherian","sequence":"first","affiliation":[]},{"given":"Stephen","family":"Gould","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,8,19]]},"reference":[{"issue":"2","key":"1111_CR1","doi-asserted-by":"publisher","first-page":"411","DOI":"10.1002\/mrm.20965","volume":"56","author":"V Arsigny","year":"2006","unstructured":"Arsigny, V., Fillard, P., Pennec, X., & Ayache, N. (2006). Log-euclidean metrics for fast and simple calculus on diffusion tensors. Magnetic Resonance in Medicine, 56(2), 411\u2013421.","journal-title":"Magnetic Resonance in Medicine"},{"key":"1111_CR2","doi-asserted-by":"crossref","unstructured":"Baccouche, M., Mamalet, F., Wolf, C., Garcia, C., & Baskurt, A. (2011). Sequential deep learning for human action recognition. In Human Behavior Understanding, pp 29\u201339.","DOI":"10.1007\/978-3-642-25446-8_4"},{"key":"1111_CR3","doi-asserted-by":"crossref","unstructured":"Bilen, H., Fernando, B., Gavves, E., Vedaldi, A., & Gould, S. (2016). Dynamic image networks for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.331"},{"key":"1111_CR4","unstructured":"Blank, M., Gorelick, L., Shechtman, E., Irani, M., & Basri, R. (2005). Actions as space-time shapes. IEEE: In ICCV."},{"key":"1111_CR5","doi-asserted-by":"crossref","unstructured":"Bojanowski, P., Lajugie, R., Bach, F., Laptev, I., Ponce, J., Schmid, C., & Sivic, J. (2014). Weakly supervised action labeling in videos under ordering constraints. In ECCV.","DOI":"10.1007\/978-3-319-10602-1_41"},{"key":"1111_CR6","doi-asserted-by":"crossref","unstructured":"Cai, Z., Wang, L., Peng, X., & Qiao, Y. (2014). Multi-view super vector for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2014.83"},{"key":"1111_CR7","doi-asserted-by":"crossref","unstructured":"Carreira, J., & Zisserman, A. (2017). Quo vadis, action recognition? a new model and the kinetics dataset. In CVPR, pp. 4724\u20134733. IEEE.","DOI":"10.1109\/CVPR.2017.502"},{"key":"1111_CR8","doi-asserted-by":"crossref","unstructured":"Carreira, J., Caseiro, R., Batista, J., & Sminchisescu, C. (2012). Semantic segmentation with second-order pooling. In ECCV.","DOI":"10.1007\/978-3-642-33786-4_32"},{"issue":"6","key":"1111_CR9","doi-asserted-by":"publisher","first-page":"633","DOI":"10.1016\/j.cviu.2013.01.013","volume":"117","author":"JM Chaquet","year":"2013","unstructured":"Chaquet, J. M., Carmona, E. J., & Fern\u00e1ndez-Caballero, A. (2013). A survey of video datasets for human action and activity recognition. Computer Vision and Image Understanding, 117(6), 633\u2013659.","journal-title":"Computer Vision and Image Understanding"},{"key":"1111_CR10","unstructured":"Chatfield, K., Simonyan, K., Vedaldi, A, & Zisserman, A. (2014). Return of the devil in the details: Delving deep into convolutional nets. arXiv preprint arXiv:1405.3531 ."},{"key":"1111_CR11","unstructured":"Chen, X., & Yuille, A.L. (2014). Articulated pose estimation by a graphical model with image dependent pairwise relations. In NIPS."},{"issue":"9","key":"1111_CR12","doi-asserted-by":"publisher","first-page":"2161","DOI":"10.1109\/TPAMI.2012.259","volume":"35","author":"A Cherian","year":"2013","unstructured":"Cherian, A., Sra, S., Banerjee, A., & Papanikolopoulos, N. (2013). Jensen-bregman logdet divergence with application to efficient similarity search for covariance matrices. PAMI, 35(9), 2161\u20132174.","journal-title":"PAMI"},{"key":"1111_CR13","doi-asserted-by":"crossref","unstructured":"Cherian, A., Fernando, B., Harandi, M., & Gould, S. (2017a). Generalized rank pooling for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2017.172"},{"key":"1111_CR14","doi-asserted-by":"crossref","unstructured":"Cherian, A., Koniusz, P., & Gould, S. (2017b). Higher-order pooling of CNN features via kernel linerization for action recognition. In WACV.","DOI":"10.1109\/WACV.2017.22"},{"key":"1111_CR15","doi-asserted-by":"crossref","unstructured":"Cherian, A., Sra, S., Gould, S., & Hartley, R. (2018). Non-linear temporal subspace representations for activity recognition. In CVPR, pp 2197\u20132206.","DOI":"10.1109\/CVPR.2018.00234"},{"key":"1111_CR16","unstructured":"Ch\u00e9ron, G., Laptev, I., & Schmid, C.. (2015). P-CNN: Pose-based CNN features for action recognition. arXiv preprint arXiv:1506.03607 ."},{"key":"1111_CR17","unstructured":"Davis, J. W., & Bobick, A. F. (1997). The representation and recognition of human movement using temporal templates. IEEE: In CVPR."},{"key":"1111_CR18","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., & Darrell, T. (2014). Long-term recurrent convolutional networks for visual recognition and description. arXiv preprint arXiv:1411.4389 .","DOI":"10.21236\/ADA623249"},{"key":"1111_CR19","doi-asserted-by":"crossref","unstructured":"Duchenne, O., Laptev, I., Sivic, J., Bach, F., & Ponce, J. (2009). Automatic annotation of human actions in video. In ICCV.","DOI":"10.1109\/ICCV.2009.5459279"},{"key":"1111_CR20","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., & Wildes, R. (2016a). Spatiotemporal residual networks for video action recognition. In NIPS.","DOI":"10.1109\/CVPR.2017.787"},{"key":"1111_CR21","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., & Zisserman, A. (2016b). Convolutional two-stream network fusion for video action recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.213"},{"key":"1111_CR22","unstructured":"Feichtenhofer, C,, Pinz, A., & Wildes, R. P. (2017). Spatiotemporal multiplier networks for video action recognition. IEEE: In CVPR."},{"key":"1111_CR23","doi-asserted-by":"crossref","unstructured":"Fernando, B., Gavves, E., Oramas, J. M., Ghodrati, A., & Tuytelaars, T. (2015a). Modeling video evolution for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2015.7299176"},{"key":"1111_CR24","doi-asserted-by":"crossref","unstructured":"Fernando, B., Gavves, E., Oramas, J. M., Ghodrati, A., & Tuytelaars, T. (2015b). Modeling video evolution for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2015.7299176"},{"issue":"11","key":"1111_CR25","doi-asserted-by":"publisher","first-page":"2188","DOI":"10.1109\/TPAMI.2011.70","volume":"33","author":"J Gall","year":"2011","unstructured":"Gall, J., Yao, A., Razavi, N., Van Gool, L., & Lempitsky, V. (2011). Hough forests for object detection, tracking, and action recognition. PAMI, 33(11), 2188\u20132202.","journal-title":"PAMI"},{"key":"1111_CR26","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Ramanan, D., Gupta, A., Sivic, J., & Russell, B. (2017). Actionvlad: Learning spatio-temporal aggregation for action classification. In CVPR, volume 2, p. 3.","DOI":"10.1109\/CVPR.2017.337"},{"key":"1111_CR27","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., & Malik, J. (2015). Finding action tubes. In CVPR.","DOI":"10.1109\/CVPR.2015.7298676"},{"key":"1111_CR28","unstructured":"Gu, C., Sun, C., Ross, D. A., Vondrick, C., Pantofaru, C., Li, Y., Vijayanarasimhan, S., Toderici, G., Ricco, S., Sukthankar, R., et al. (2017). AVA: A video dataset of spatio-temporally localized atomic visual actions. CoRR, abs\/1705.08421, 4."},{"key":"1111_CR29","unstructured":"Guo, K., Ishwar, P., & Konrad, J. (2013). Action recognition from video using feature covariance matrices. In TIP."},{"key":"1111_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"1111_CR31","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1016\/j.imavis.2017.01.010","volume":"60","author":"S Herath","year":"2017","unstructured":"Herath, S., Harandi, M., & Porikli, F. (2017). Going deeper into action recognition: A survey. Image and Vision Computing, 60, 4\u201321. ISSN 0262-8856. Regularization Techniques for High-Dimensional Data Analysis.","journal-title":"Image and Vision Computing"},{"issue":"4","key":"1111_CR32","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1214\/ss\/1009212519","volume":"14","author":"JA Hoeting","year":"1999","unstructured":"Hoeting, J. A., Madigan, D., Raftery, A. E., & Volinsky, C. T. (1999). Bayesian model averaging: A tutorial. Statistical Science, 14(4), 382\u2013401.","journal-title":"Statistical Science"},{"key":"1111_CR33","doi-asserted-by":"crossref","unstructured":"Huang, Z., & Van Gool, L. (2017). A riemannian network for spd matrix learning. In AAAI.","DOI":"10.1609\/aaai.v31i1.10866"},{"key":"1111_CR34","doi-asserted-by":"crossref","unstructured":"Ionescu, C., Vantzos, O. & Sminchisescu, C. (2015). Matrix backpropagation for deep networks with structured layers. In ICCV.","DOI":"10.1109\/ICCV.2015.339"},{"key":"1111_CR35","doi-asserted-by":"crossref","unstructured":"Jebara, T., & Kondor, R. (2003). Bhattacharyya and expected likelihood kernels. In Learning theory and kernel machines, pp. 57\u201371. Springer.","DOI":"10.1007\/978-3-540-45167-9_6"},{"key":"1111_CR36","doi-asserted-by":"crossref","unstructured":"J\u00e9gou, H., Douze, M., & Schmid, C. (2009). On the burstiness of visual elements. In CVPR.","DOI":"10.1109\/CVPR.2009.5206609"},{"issue":"1","key":"1111_CR37","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1109\/TPAMI.2010.57","volume":"33","author":"H Jegou","year":"2011","unstructured":"Jegou, H., Douze, M., & Schmid, C. (2011). Product quantization for nearest neighbor search. PAMI, 33(1), 117\u2013128.","journal-title":"PAMI"},{"key":"1111_CR38","doi-asserted-by":"crossref","unstructured":"Jhuang, H., Gall, J., Zuffi, S., Schmid, C., & Black, Michael J. (2013). Towards understanding action recognition. In ICCV.","DOI":"10.1109\/ICCV.2013.396"},{"issue":"1","key":"1111_CR39","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji, S., Wei, X., Yang, M., & Kai, Y. (2013). 3d convolutional neural networks for human action recognition. PAMI, 35(1), 221\u2013231.","journal-title":"PAMI"},{"key":"1111_CR40","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., & Fei-Fei, Li (2014). Large-scale video classification with convolutional neural networks. In CVPR.","DOI":"10.1109\/CVPR.2014.223"},{"key":"1111_CR41","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., et al. (2017). The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 ."},{"key":"1111_CR42","doi-asserted-by":"crossref","unstructured":"Klaser, A., Marsza\u0142ek, M., & Schmid, C. (2008). A spatio-temporal descriptor based on 3d-gradients. In BMVC.","DOI":"10.5244\/C.22.99"},{"key":"1111_CR43","doi-asserted-by":"crossref","unstructured":"Koniusz, P., Cherian, A., & Porikli, F. (2016). Tensor representations via kernel linearization for action recognition from 3D skeletons. In ECCV.","DOI":"10.1007\/978-3-319-46493-0_3"},{"key":"1111_CR44","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. In NIPS."},{"key":"1111_CR45","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., & Serre, T. (2011). Hmdb: A large video database for human motion recognition. IEEE: In ICCV."},{"key":"1111_CR46","doi-asserted-by":"crossref","unstructured":"Lan, T., Chen, T.-C., & Savarese, S. (2014). A hierarchical representation for future action prediction. In ECCV.","DOI":"10.1007\/978-3-319-10578-9_45"},{"key":"1111_CR47","doi-asserted-by":"crossref","unstructured":"Lan, T., Zhu, Y., Zamir Roshan, A., & Savarese, S. (2015). Action recognition by hierarchical mid-level action elements. In ICCV.","DOI":"10.1109\/ICCV.2015.517"},{"issue":"2\u20133","key":"1111_CR48","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1007\/s11263-005-1838-7","volume":"64","author":"I Laptev","year":"2005","unstructured":"Laptev, I. (2005). On space-time interest points. International Journal of Computer Vision, 64(2\u20133), 107\u2013123.","journal-title":"International Journal of Computer Vision"},{"key":"1111_CR49","doi-asserted-by":"crossref","unstructured":"Le, Q. V., Zou, W. Y., Yeung, S. Y., & Ng, A. Y. (2011). Learning hierarchical invariant spatio-temporal features for action recognition with independent subspace analysis. In CVPR.","DOI":"10.1109\/CVPR.2011.5995496"},{"key":"1111_CR50","doi-asserted-by":"crossref","unstructured":"Lei, J., Ren, X, & Fox, D. (2012). Fine-grained kitchen activity recognition using RGB-D. In ACM Conference on Ubiquitous Computing.","DOI":"10.1145\/2370216.2370248"},{"key":"1111_CR51","doi-asserted-by":"crossref","unstructured":"Li, P., Wang, Q., Zuo, W., & Zhang, L. (2013). Log-euclidean kernels for sparse representation and dictionary learning. In ICCV.","DOI":"10.1109\/ICCV.2013.202"},{"key":"1111_CR52","unstructured":"Monfort, M., Zhou, B., Bargal, S. A., Andonian, A., Yan, T., Ramakrishnan, K., Brown, L., Fan, Q., Gutfruend, D., Vondrick, C. et al. (2018). Moments in time dataset: One million videos for event understanding. arXiv preprint arXiv:1801.03150 ."},{"key":"1111_CR53","doi-asserted-by":"crossref","unstructured":"Newell, A., Yang, K., & Deng, J. (2016). Stacked hourglass networks for human pose estimation. In ECCV, Springer.","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"1111_CR54","doi-asserted-by":"crossref","unstructured":"Ni, B., Paramathayalan, V. R., & Moulin, P. (2014). Multiple granularity analysis for fine-grained action detection. In CVPR.","DOI":"10.1109\/CVPR.2014.102"},{"key":"1111_CR55","doi-asserted-by":"crossref","unstructured":"Oneata, D., Verbeek, J., & Schmid, C. (2013). Action and event recognition with fisher vectors on a compact feature set. In ICCV.","DOI":"10.1109\/ICCV.2013.228"},{"key":"1111_CR56","unstructured":"Pascanu, R., Mikolov, T., & Bengio, Y. (2013). On the difficulty of training recurrent neural networks. In ICML."},{"key":"1111_CR57","doi-asserted-by":"crossref","unstructured":"Peng, X., Zou, C., Qiao, Y., & Qiang, P. (2014). Action recognition with stacked fisher vectors. In ECCV, Springer.","DOI":"10.1007\/978-3-319-10602-1_38"},{"key":"1111_CR58","doi-asserted-by":"crossref","unstructured":"Peng, X., Wang, L., Wang, X., & Qiao, Y. (2016). Bag of visual words and fusion methods for action recognition: Comprehensive study and good practice. In CVIU.","DOI":"10.1016\/j.cviu.2016.03.013"},{"issue":"1","key":"1111_CR59","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1007\/s11263-005-3222-z","volume":"66","author":"X Pennec","year":"2006","unstructured":"Pennec, X., Fillard, P., & Ayache, N. (2006). A riemannian framework for tensor computing. International Journal of Computer Vision, 66(1), 41\u201366.","journal-title":"International Journal of Computer Vision"},{"key":"1111_CR60","doi-asserted-by":"crossref","unstructured":"Pirsiavash, H., & Ramanan, D. (2014). Parsing videos of actions with segmental grammars. In CVPR.","DOI":"10.1109\/CVPR.2014.85"},{"key":"1111_CR61","doi-asserted-by":"crossref","unstructured":"Pishchulin, L., Andriluka, M., & Schiele, B. (2014). Fine-grained activity recognition with holistic and pose based features. In Pattern Recognition, (pp. 678\u2013689). Springer.","DOI":"10.1007\/978-3-319-11752-2_56"},{"issue":"3","key":"1111_CR62","doi-asserted-by":"publisher","first-page":"601","DOI":"10.1109\/TPAMI.2011.158","volume":"34","author":"A Prest","year":"2012","unstructured":"Prest, A., Schmid, C., & Ferrari, V. (2012). Weakly supervised learning of interactions between humans and objects. PAMI, 34(3), 601\u2013614.","journal-title":"PAMI"},{"key":"1111_CR63","unstructured":"Ren, S,, He, K., Girshick, R., & Sun, J. (2015). Faster R-CNN: Towards real-time object detection with region proposal networks. In NIPS, (pp. 91\u201399)."},{"key":"1111_CR64","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Amin, S., Andriluka, M., & Schiele, B. (2012). A database for fine grained activity detection of cooking activities. In CVPR.","DOI":"10.1109\/CVPR.2012.6247801"},{"key":"1111_CR65","unstructured":"Rohrbach, M., Rohrbach, A., Regneri, M., Amin, S., Andriluka, M., Pinkal, M., & Schiele, B. (2015). Recognizing fine-grained and composite activities using hand-centric features and script data. arXiv preprint arXiv:1502.06648 ."},{"key":"1111_CR66","doi-asserted-by":"crossref","unstructured":"Ryoo, M. S., & Aggarwal, J. K. (2006). Recognition of composite human activities through context-free grammar based representation. In CVPR.","DOI":"10.1109\/CVPR.2006.242"},{"key":"1111_CR67","doi-asserted-by":"crossref","unstructured":"Sadanand, S., & Corso, J. J. (2012). Action bank: A high-level representation of activity in video. In CVPR.","DOI":"10.1109\/CVPR.2012.6247806"},{"key":"1111_CR68","unstructured":"Simonyan, K., & Zisserman, A. (2014). Two-stream convolutional networks for action recognition in videos. In NIPS."},{"key":"1111_CR69","unstructured":"Soomro, K., Zamir, A. R, & Shah, M. (2012). UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 ."},{"key":"1111_CR70","unstructured":"Sra, S. (2011). Positive definite matrices and the symmetric stein divergence. Technical report."},{"key":"1111_CR71","unstructured":"Srivastava, N., Mansimov, E., & Salakhutdinov, R. (2015). Unsupervised learning of video representations using LSTMs. In ICML."},{"key":"1111_CR72","doi-asserted-by":"crossref","unstructured":"Sun, C., & Nevatia, R. (2014). Discover: Discovering important segments for classification of video events and recounting. In CVPR.","DOI":"10.1109\/CVPR.2014.329"},{"key":"1111_CR73","doi-asserted-by":"crossref","unstructured":"Tang, K., Fei-Fei, L., & Koller, D. (2012). Learning latent temporal structure for complex event detection. In CVPR.","DOI":"10.1109\/CVPR.2012.6247808"},{"key":"1111_CR74","doi-asserted-by":"crossref","unstructured":"Tompson, J., Goroshin, R., Jain, A., LeCun, Y., & Bregler, C. (2015). Efficient object localization using convolutional networks. In CVPR.","DOI":"10.1109\/CVPR.2015.7298664"},{"key":"1111_CR75","unstructured":"Tompson, J. J., Jain, A., LeCun, Y., & Bregler, C. (2014). Joint training of a convolutional network and a graphical model for human pose estimation. In NIPS."},{"key":"1111_CR76","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., D., Fergus, R., Torresani, L., & Paluri, M.. (2015). Learning spatiotemporal features with 3D convolutional networks. In ICCV.","DOI":"10.1109\/ICCV.2015.510"},{"issue":"3","key":"1111_CR77","doi-asserted-by":"publisher","first-page":"480","DOI":"10.1109\/TPAMI.2011.153","volume":"34","author":"A Vedaldi","year":"2012","unstructured":"Vedaldi, A., & Zisserman, A. (2012). Efficient additive kernels via explicit feature maps. PAMI, 34(3), 480\u2013492.","journal-title":"PAMI"},{"key":"1111_CR78","doi-asserted-by":"crossref","unstructured":"Wang, C., Wang, Y., & Yuille, A. L. (2013a). An approach to pose-based action recognition. In CVPR.","DOI":"10.1109\/CVPR.2013.123"},{"key":"1111_CR79","doi-asserted-by":"crossref","unstructured":"Wang, H, & Schmid, C. (2013). Action recognition with improved trajectories. In ICCV.","DOI":"10.1109\/ICCV.2013.441"},{"issue":"1","key":"1111_CR80","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1007\/s11263-012-0594-8","volume":"103","author":"H Wang","year":"2013","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C., & Liu, C.-L. (2013b). Dense trajectories and motion boundary descriptors for action recognition. IJCV, 103(1), 60\u201379.","journal-title":"IJCV"},{"key":"1111_CR81","doi-asserted-by":"crossref","unstructured":"Wang, J., Cherian, A., & Porikli, F. (2017). Ordered pooling of optical flow sequences for action recognition. In WACV.","DOI":"10.1109\/WACV.2017.26"},{"key":"1111_CR82","doi-asserted-by":"crossref","unstructured":"Wang, J., Cherian, A., Porikli, F., & Gould, S. (2018). Video representation learning using discriminative pooling. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, (pp. 1149\u20131158).","DOI":"10.1109\/CVPR.2018.00126"},{"key":"1111_CR83","doi-asserted-by":"crossref","unstructured":"Wang, L., Qiao, Y., & Tang, X. (2015). Action recognition with trajectory-pooled deep-convolutional descriptors. In CVPR.","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"1111_CR84","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., & Van Gool, L. (2016). Temporal segment networks: Towards good practices for deep action recognition. In ECCV.","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"1111_CR85","doi-asserted-by":"crossref","unstructured":"Wei, S.-E., Ramakrishna, V., Kanade, T., & Sheikh, Y. (2016). Convolutional pose machines. In CVPR.","DOI":"10.1109\/CVPR.2016.511"},{"key":"1111_CR86","doi-asserted-by":"crossref","unstructured":"Wu, C., Zhang, J., Savarese, S., & Saxena, A. (2015). Watch-n-patch: Unsupervised understanding of actions and relations. In CVPR.","DOI":"10.1109\/CVPR.2015.7299065"},{"key":"1111_CR87","doi-asserted-by":"crossref","unstructured":"Yao, A., Gall, J., Fanelli, G., & Van Gool, L. J. (2011a). Does human action recognition benefit from pose estimation?. In BMVC.","DOI":"10.5244\/C.25.67"},{"key":"1111_CR88","unstructured":"Yao, B., & Fei-Fei, L. (2012). Action recognition with exemplar based 2.5 d graph matching. In ECCV."},{"key":"1111_CR89","doi-asserted-by":"crossref","unstructured":"Yao, B., Jiang, X., Khosla, A., Lin, A. L., Guibas, L., & Fei-Fei, L. (2011b). Human action recognition by learning bases of action attributes and parts. In ICCV.","DOI":"10.1109\/ICCV.2011.6126386"},{"key":"1111_CR90","unstructured":"Yu, K., & Salzmann, M. (2017). Second-order convolutional neural networks. arXiv preprint arXiv:1703.06817 ."},{"key":"1111_CR91","unstructured":"Yuan, C., Hu, W., Li, X., Maybank, S., & Luo, G. (2009). Human action recognition under log-euclidean riemannian metric. In ACCV."},{"key":"1111_CR92","doi-asserted-by":"crossref","unstructured":"Yue-Hei Ng, J., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R., & Toderici, G. (2015). Beyond short snippets: Deep networks for video classification. In CVPR.","DOI":"10.1109\/CVPR.2015.7299101"},{"key":"1111_CR93","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Ni, B., Yan, S., Moulin, P., & Tian, Q. (2014). Pipelining localized semantic features for fine-grained action recognition. In ECCV.","DOI":"10.1007\/978-3-319-10593-2_32"},{"key":"1111_CR94","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Ni, B., Hong, R., Wang, M., & Tian, Q. (2015). Interaction part mining: A mid-level approach for fine-grained action recognition. In CVPR.","DOI":"10.1109\/CVPR.2015.7298953"},{"key":"1111_CR95","unstructured":"Zisserman, A., Carreira, J., Simonyan, K., Kay, W., Zhang, B., Hillier, C., Vijayanarasimhan, S., Viola, F., Green, T., Back, T. et al. (2017). The kinetics human action video dataset."},{"issue":"3","key":"1111_CR96","doi-asserted-by":"publisher","first-page":"437","DOI":"10.1007\/s11263-012-0549-0","volume":"101","author":"S Zuffi","year":"2013","unstructured":"Zuffi, S., & Black, M. J. (2013). Puppet flow. IJCV, 101(3), 437\u2013458.","journal-title":"IJCV"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-018-1111-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-018-1111-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-018-1111-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,29]],"date-time":"2022-08-29T19:37:30Z","timestamp":1661801850000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-018-1111-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,8,19]]},"references-count":96,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2019,4]]}},"alternative-id":["1111"],"URL":"https:\/\/doi.org\/10.1007\/s11263-018-1111-5","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,8,19]]},"assertion":[{"value":"23 April 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 August 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 August 2018","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}