{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T10:22:11Z","timestamp":1760955731924,"version":"3.37.3"},"reference-count":71,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2017,6,24]],"date-time":"2017-06-24T00:00:00Z","timestamp":1498262400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2017,6,24]],"date-time":"2017-06-24T00:00:00Z","timestamp":1498262400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100000923","name":"Australian Research Council","doi-asserted-by":"publisher","award":["CE140100016"],"award-info":[{"award-number":["CE140100016"]}],"id":[{"id":"10.13039\/501100000923","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2017,9]]},"DOI":"10.1007\/s11263-017-1030-x","type":"journal-article","created":{"date-parts":[[2017,6,24]],"date-time":"2017-06-24T10:57:17Z","timestamp":1498301837000},"page":"335-355","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["Discriminatively Learned Hierarchical Rank Pooling Networks"],"prefix":"10.1007","volume":"124","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6920-9916","authenticated-orcid":false,"given":"Basura","family":"Fernando","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stephen","family":"Gould","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,6,24]]},"reference":[{"key":"1030_CR1","unstructured":"Abu-El-Haija, S., Kothari, N., Lee, J., Natsev, P., Toderici, G., Varadarajan, B., & Vijayanarasimhan, S. (2016). Youtube-8m: A large-scale video classification benchmark. \n                    arXiv:1609.08675\n                    \n                  ."},{"key":"1030_CR2","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4757-2836-1","volume-title":"Practical bilevel optimization: Algorithms and applications","author":"JF Bard","year":"1998","unstructured":"Bard, J. F. (1998). Practical bilevel optimization: Algorithms and applications. Dordrecht: Kluwer Academic Press."},{"key":"1030_CR3","unstructured":"Bilen, H., Fernando, B., Gavves, E., & Vedaldi, A. (2016). Action recognition with dynamic image networks. \n                    arXiv:1612.00738\n                    \n                  ."},{"key":"1030_CR4","doi-asserted-by":"crossref","unstructured":"Bilen, H., Fernando, B., Gavves, E., Vedaldi, A., & Gould, S. (2016). Dynamic image networks for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.331"},{"key":"1030_CR5","first-page":"499","volume":"2","author":"O Bousquet","year":"2002","unstructured":"Bousquet, O., & Elisseeff, A. (2002). Stability and generalization. JMLR, 2, 499\u2013526.","journal-title":"JMLR"},{"key":"1030_CR6","doi-asserted-by":"crossref","unstructured":"Bregler, C. (1997). Learning and recognizing human dynamics in video sequences. In CVPR, IEEE (pp. 568\u2013574).","DOI":"10.1109\/CVPR.1997.609382"},{"key":"1030_CR7","doi-asserted-by":"crossref","unstructured":"Brox, T., Bruhn, A., Papenberg, N., & Weickert, J. (2004). High accuracy optical flow estimation based on a theory for warping. In ECCV.","DOI":"10.1007\/978-3-540-24673-2_3"},{"issue":"3","key":"1030_CR8","first-page":"27","volume":"2","author":"C-C Chang","year":"2011","unstructured":"Chang, C.-C., & Lin, C.-J. (2011). Libsvm: A library for support vector machines. ACM Transactions on Intelligent Systems and Technology (TIST), 2(3), 27.","journal-title":"ACM Transactions on Intelligent Systems and Technology (TIST)"},{"key":"1030_CR9","unstructured":"Chollet, F. (2015). Keras."},{"issue":"3","key":"1030_CR10","doi-asserted-by":"publisher","first-page":"685","DOI":"10.1007\/s10589-015-9795-8","volume":"63","author":"S Dempe","year":"2016","unstructured":"Dempe, S., & Franke, S. (2016). On the solution of convex bilevel optimization problems. Computational Optimization and Applications, 63(3), 685\u2013703.","journal-title":"Computational Optimization and Applications"},{"key":"1030_CR11","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., & Fei-Fei, L. (2009). ImageNet: A large-scale hierarchical image database. In CVPR.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1030_CR12","unstructured":"Do, C.B., Foo, C.-S., & Ng, A.Y. (2007). Efficient multiple hyperparameter learning for log-linear models. In NIPS"},{"key":"1030_CR13","unstructured":"Domke, J. (2012). Generic methods for optimization-based modeling. In AISTATS."},{"key":"1030_CR14","unstructured":"Du, Y., Wang, W., & Wang, L. (2015). Hierarchical recurrent neural network for skeleton based action recognition. In CVPR."},{"key":"1030_CR15","first-page":"1871","volume":"9","author":"R-E Fan","year":"2008","unstructured":"Fan, R.-E., Chang, K.-W., Hsieh, C.-J., Wang, X.-R., & Lin, C.-J. (2008). Liblinear: A library for large linear classification. Journal of Machine Learning Research, 9, 1871\u20131874.","journal-title":"Journal of Machine Learning Research"},{"key":"1030_CR16","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., & Zisserman, A. (2016). Convolutional two-stream network fusion for video action recognition. In CVPR","DOI":"10.1109\/CVPR.2016.213"},{"issue":"99","key":"1030_CR17","first-page":"1","volume":"PP","author":"B Fernando","year":"2016","unstructured":"Fernando, B., Gavves, E., Oramas, J., Ghodrati, A., & Tuytelaars, T. (2016). Rank pooling for action recognition. TPAMI, PP(99), 1\u20131.","journal-title":"TPAMI"},{"key":"1030_CR18","doi-asserted-by":"crossref","unstructured":"Fernando, B., Anderson, P., Hutter, M., & Gould, S. (2016). Discriminative hierarchical rank pooling for activity recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.212"},{"key":"1030_CR19","doi-asserted-by":"crossref","unstructured":"Fernando, B., Gavves, E., Oramas, J., Ghodrati, A., & Tuytelaars, T. (2015). Modeling video evolution for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2015.7299176"},{"key":"1030_CR20","unstructured":"Fernando, B., & Gould, S. (2016). Learning end-to-end video classification with rank-pooling. In ICML."},{"key":"1030_CR21","unstructured":"Fox, E., Jordan, M.I., Sudderth, E.B., & Willsky, A.S. (2009). Sharing features among dynamical systems with beta processes. In NIPS (pp. 549\u2013557)."},{"key":"1030_CR22","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., & Malik, J. (2014). Rich feature hierarchies for accurate object detection and semantic segmentation. In CVPR.","DOI":"10.1109\/CVPR.2014.81"},{"key":"1030_CR23","volume-title":"Matrix computations","author":"Gene\u00a0H Golub","year":"1996","unstructured":"Golub, Gene\u00a0 H, & Van Loan, Charles F. (1996). Matrix computations (3rd ed.). Baltimore: Johns Hopkins University Press.","edition":"3"},{"key":"1030_CR24","unstructured":"Gould, S., Fernando, B., Cherian, A., Anderson, P., Cruz, R.S., & Guo, E. (2016). On differentiating parameterized argmin and argmax problems with application to bi-level optimization. 1(1):1. \n                    arXiv:1607.05447\n                    \n                  ."},{"key":"1030_CR25","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"1030_CR26","unstructured":"Hoai, M., & Zisserman, A. (2014). Improving human action recognition using score distribution and ranking. In ACCV."},{"issue":"8","key":"1030_CR27","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"1030_CR28","doi-asserted-by":"crossref","unstructured":"Hughes, M.C., & Sudderth, E.B. (2012). Nonparametric discovery of activity patterns from video collections. In CVPR Workshops (pp. 25\u201332).","DOI":"10.1109\/CVPRW.2012.6239170"},{"key":"1030_CR29","doi-asserted-by":"crossref","unstructured":"Jain, M., J\u00e9gou, H., & Bouthemy, P. (2013). Better exploiting motion for better action recognition. In CVPR.","DOI":"10.1109\/CVPR.2013.330"},{"key":"1030_CR30","doi-asserted-by":"crossref","unstructured":"J\u00e9gou, H., Douze, M., Schmid, C., & P\u00e9rez, P. (2010). Aggregating local descriptors into a compact image representation. In CVPR, IEEE (pp. 3304\u20133311).","DOI":"10.1109\/CVPR.2010.5540039"},{"issue":"1","key":"1030_CR31","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji, S., Xu, W., Yang, M., & Yu, K. (2013). 3d convolutional neural networks for human action recognition. PAMI, 35(1), 221\u2013231.","journal-title":"PAMI"},{"key":"1030_CR32","doi-asserted-by":"crossref","unstructured":"Jia, Y., Shelhamer, E., Donahue, J., Karayev, S., Long, J., Girshick, R., Guadarrama, S., & Darrell, T. (2014). Caffe: Convolutional architecture for fast feature embedding. In Proceedings of the ACM International Conference on Multimedia (pp. 675\u2013678). ACM.","DOI":"10.1145\/2647868.2654889"},{"key":"1030_CR33","doi-asserted-by":"crossref","unstructured":"Joachims, T. (2006). Training linear svms in linear time. In ICKDD.","DOI":"10.1145\/1150402.1150429"},{"key":"1030_CR34","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., & Fei-Fei, L. (2014). Large-scale video classification with convolutional neural networks. In CVPR.","DOI":"10.1109\/CVPR.2014.223"},{"key":"1030_CR35","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., Viola, F., Green, T., Back, T., & Natsev, P. et\u00a0al. (2017). The kinetics human action video dataset. \n                    arXiv:1705.06950\n                    \n                  ."},{"key":"1030_CR36","unstructured":"Klatzer, T., & Pock, T. (2015). Continuous hyper-parameter learning for support vector machines. In Computer Vision Winter Workshop (CVWW)."},{"key":"1030_CR37","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G.E. (2012). Imagenet classification with deep convolutional neural networks. In NIPS (pp. 1097\u20131105)."},{"key":"1030_CR38","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., & Serre, T. (2011). Hmdb: A large video database for human motion recognition. In ICCV.","DOI":"10.1109\/ICCV.2011.6126543"},{"issue":"2","key":"1030_CR39","doi-asserted-by":"publisher","first-page":"938","DOI":"10.1137\/120882706","volume":"6","author":"K Kunisch","year":"2013","unstructured":"Kunisch, K., & Pock, T. (2013). A bilevel optimization approach for parameter learning in variational models. SIAM Journal on Imaging Sciences, 6(2), 938\u2013983.","journal-title":"SIAM Journal on Imaging Sciences"},{"key":"1030_CR40","unstructured":"Lan, T., Zhu, Y., Roshan Zamir, A. & Savarese, S. (2015). In ICCV: Action recognition by hierarchical mid-level action elements. In ICCV."},{"key":"1030_CR41","unstructured":"Lan, Z., Lin, M., Li, X., Hauptmann, A.G, & Raj, B. (2015). Beyond gaussian pyramid: Multi-skip feature stacking for action recognition. In CVPR."},{"key":"1030_CR42","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marszalek, M., Schmid, C., & Rozenfeld, B. (2008). Learning realistic human actions from movies. In CVPR.","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"1030_CR43","doi-asserted-by":"crossref","unstructured":"Li, Y., Li, W., Mahadevan, V., & Vasconcelos, N. (2016). Vlad3: Encoding dynamics of deep features for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.215"},{"issue":"3","key":"1030_CR44","doi-asserted-by":"publisher","first-page":"225","DOI":"10.1561\/1500000016","volume":"3","author":"T-Y Liu","year":"2009","unstructured":"Liu, T.-Y. (2009). Learning to rank for information retrieval. Foundations and Trends in Information Retrieval, 3(3), 225\u2013331.","journal-title":"Foundations and Trends in Information Retrieval"},{"issue":"7","key":"1030_CR45","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1109\/TSA.2002.804546","volume":"10","author":"L Lu","year":"2002","unstructured":"Lu, L., Zhang, H.-J., & Jiang, H. (2002). Content analysis for audio classification and segmentation. IEEE Transactions on Speech and Audio Processing, 10(7), 504\u2013516.","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"key":"1030_CR46","unstructured":"Ng, J.Y.-H., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R. & Toderici, G. (2015). Beyond short snippets: Deep networks for video classification. In CVPR."},{"key":"1030_CR47","doi-asserted-by":"crossref","unstructured":"Ochs, P., Ranftl, R., Brox, T., & Pock, T. (2015). Bilevel optimization with nonsmooth lower level problems. In International Conference on Scale Space and Variational Methods in Computer Vision (SSVM) (pp. 654\u2013665).","DOI":"10.1007\/978-3-319-18461-6_52"},{"key":"1030_CR48","doi-asserted-by":"crossref","unstructured":"Peng, X., Zou, C., Qiao, Y., & Peng, Q. (2014). Action recognition with stacked fisher vectors. In ECCV.","DOI":"10.1007\/978-3-319-10602-1_38"},{"key":"1030_CR49","doi-asserted-by":"crossref","unstructured":"Perronnin, F., Liu, Y., S\u00e1nchez, J., & Poirier, H. (2010). Large-scale image retrieval with compressed fisher vectors. In CVPR.","DOI":"10.1109\/CVPR.2010.5540009"},{"issue":"6","key":"1030_CR50","doi-asserted-by":"publisher","first-page":"976","DOI":"10.1016\/j.imavis.2009.11.014","volume":"28","author":"R Poppe","year":"2010","unstructured":"Poppe, R. (2010). A survey on vision-based human action recognition. Image and Vision Computing, 28(6), 976\u2013990.","journal-title":"Image and Vision Computing"},{"key":"1030_CR51","doi-asserted-by":"crossref","unstructured":"Rodriguez, M.D., Ahmed, J. & Shah, M. (2008). Action mach a spatio-temporal maximum average correlation height filter for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2008.4587727"},{"key":"1030_CR52","doi-asserted-by":"crossref","unstructured":"Ryoo, M.S., Rothrock, B., & Matthies, L. (June 2015). Pooled motion features for first-person videos. In CVPR.","DOI":"10.1109\/CVPR.2015.7298691"},{"key":"1030_CR53","doi-asserted-by":"crossref","unstructured":"Samuel, K.G. G., & Tappen, M.F. (2009). Learning optimized MAP estimates in continuously-valued MRF models. In CVPR.","DOI":"10.1109\/CVPR.2009.5206774"},{"key":"1030_CR54","doi-asserted-by":"crossref","unstructured":"Sener, O., Zamir, A.R., Savarese, S., & Saxena, A. (2015). Unsupervised semantic parsing of video collections. In ICCV (pp. 4480\u20134488).","DOI":"10.1109\/ICCV.2015.509"},{"issue":"5","key":"1030_CR55","doi-asserted-by":"publisher","first-page":"410","DOI":"10.1016\/S1369-5266(03)00092-X","volume":"6","author":"K Shinozaki","year":"2003","unstructured":"Shinozaki, K., Yamaguchi-Shinozaki, K., & Seki, M. (2003). Regulatory network of gene expression in the drought and cold stress responses. Current Opinion in Plant Biology, 6(5), 410\u2013417.","journal-title":"Current Opinion in Plant Biology"},{"key":"1030_CR56","unstructured":"Simonyan, K. & Zisserman, A. (2014). Two-stream convolutional networks for action recognition in videos. In NIPS (pp. 568\u2013576)."},{"key":"1030_CR57","unstructured":"Simonyan, K. & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. 1(1):1. \n                    arXiv:1409.1556\n                    \n                  ."},{"key":"1030_CR58","unstructured":"Snoek, C., Ghanem, B., & Niebles, J.C. (2016). The activitynet large scale activity recognition challenge."},{"key":"1030_CR59","doi-asserted-by":"crossref","unstructured":"Song, Y., Morency, L.-P. & Davis, R. (2013). Action recognition by hierarchical sequence summarization. In CVPR.","DOI":"10.1109\/CVPR.2013.457"},{"key":"1030_CR60","unstructured":"Soomro, K., Zamir, A.R., & Shah, M. (2012). Ucf101: A dataset of 101 human actions classes from videos in the wild. 1(1):1. \n                    arXiv:1212.0402\n                    \n                  ."},{"key":"1030_CR61","unstructured":"Srivastava, N., Mansimov, E., & Salakhutdinov, R. (2015). Unsupervised learning of video representations using lstms. 1(1):1. \n                    arXiv:1502.04681\n                    \n                  ."},{"key":"1030_CR62","doi-asserted-by":"crossref","unstructured":"Sun, L., Jia, K., Yeung, D.-Y. & Shi, B.E. (2015). Human action recognition using factorized spatio-temporal convolutional networks. In The IEEE International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2015.522"},{"key":"1030_CR63","unstructured":"Sutskever, I., Vinyals, O., & Le Q.VV. (2014) Sequence to sequence learning with neural networks. In NIPS (pp 3104\u20133112)."},{"key":"1030_CR64","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., & Paluri, M. (2015). Learning spatiotemporal features with 3d convolutional networks. In ICCV.","DOI":"10.1109\/ICCV.2015.510"},{"key":"1030_CR65","unstructured":"Vedaldi, A., & Lenc, K. (2015). Matconvnet\u2013convolutional neural networks for matlab. In Proceeding of the ACM International Conference on Multimedia."},{"key":"1030_CR66","doi-asserted-by":"crossref","unstructured":"Veeriah, V., Zhuang, N., & Qi, G.-J. (2015). Differential recurrent neural networks for action recognition. In The IEEE International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2015.460"},{"key":"1030_CR67","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1007\/s11263-012-0594-8","volume":"103","author":"W Heng","year":"2013","unstructured":"Heng, W., Kl\u00e4ser, A., Schmid, C., & Liu, C.-L. (2013). Dense trajectories and motion boundary descriptors for action recognition. IJCV, 103, 60\u201379.","journal-title":"IJCV"},{"key":"1030_CR68","doi-asserted-by":"crossref","unstructured":"Wang, H. & Schmid, C. (2013). Action recognition with improved trajectories. In ICCV.","DOI":"10.1109\/ICCV.2013.441"},{"key":"1030_CR69","doi-asserted-by":"crossref","unstructured":"Wang, L., Qiao, Y., & Tang, X. (2015). Action recognition with trajectory-pooled deep-convolutional descriptors. In CVPR (pp. 4305\u20134314).","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"1030_CR70","doi-asserted-by":"crossref","unstructured":"Wu, J., Zhang, Y., & Lin, W. (2014). Towards good practices for action video encoding. In CVPR.","DOI":"10.1109\/CVPR.2014.330"},{"key":"1030_CR71","doi-asserted-by":"crossref","unstructured":"Zha, S., Luisier, F., Andrews, W., Srivastava, N., & Salakhutdinov, R. (2015). Exploiting image-trained CNN architectures for unconstrained video classification. In BMVC.","DOI":"10.5244\/C.29.60"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-017-1030-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-017-1030-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-017-1030-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,5,17]],"date-time":"2020-05-17T07:16:22Z","timestamp":1589699782000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-017-1030-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,6,24]]},"references-count":71,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2017,9]]}},"alternative-id":["1030"],"URL":"https:\/\/doi.org\/10.1007\/s11263-017-1030-x","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2017,6,24]]},"assertion":[{"value":"13 September 2016","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 June 2017","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 June 2017","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}