{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T15:47:21Z","timestamp":1774540041206,"version":"3.50.1"},"reference-count":71,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2022,8,7]],"date-time":"2022-08-07T00:00:00Z","timestamp":1659830400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,8,7]],"date-time":"2022-08-07T00:00:00Z","timestamp":1659830400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2022,10]]},"DOI":"10.1007\/s11263-022-01661-1","type":"journal-article","created":{"date-parts":[[2022,8,7]],"date-time":"2022-08-07T13:02:10Z","timestamp":1659877330000},"page":"2453-2471","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":50,"title":["EAN: Event Adaptive Network for Enhanced Action Recognition"],"prefix":"10.1007","volume":"130","author":[{"given":"Yuan","family":"Tian","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3209-8965","authenticated-orcid":false,"given":"Yichao","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Guangtao","family":"Zhai","sequence":"additional","affiliation":[]},{"given":"Guodong","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Zhiyong","family":"Gao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,8,7]]},"reference":[{"key":"1661_CR1","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., & Schmid, C. (2021). Vivit: A video vision transformer. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 6836\u20136846).","DOI":"10.1109\/ICCV48922.2021.00676"},{"issue":"3","key":"1661_CR2","doi-asserted-by":"publisher","first-page":"502","DOI":"10.1007\/s11263-016-0934-1","volume":"122","author":"R Bensch","year":"2017","unstructured":"Bensch, R., Scherf, N., Huisken, J., Brox, T., & Ronneberger, O. (2017). Spatiotemporal deformable prototypes for motion anomaly detection. International Journal of Computer Vision, 122(3), 502\u2013523.","journal-title":"International Journal of Computer Vision"},{"key":"1661_CR3","unstructured":"Bertasius, G., Feichtenhofer, C., Tran, D., Shi, J., & Torresani, L. (2018). Learning discriminative motion features through detection. arXiv:1812.04172"},{"key":"1661_CR4","unstructured":"Bertasius, G., Wang, H., &Torresani, L. (2021). Is space-time attention all you need for video understanding? arXiv:2102.05095"},{"key":"1661_CR5","unstructured":"Bulat, A., Perez Rua, J. M., Sudhakaran, S., Martinez, B., & Tzimiropoulos, G. (2021). Space-time mixing attention for video transformer. Advances in Neural Information Processing Systems, 34"},{"key":"1661_CR6","doi-asserted-by":"crossref","unstructured":"Carreira, J., & Zisserman, A. (2017). Quo vadis, action recognition? A new model and the kinetics dataset. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 6299\u20136308).","DOI":"10.1109\/CVPR.2017.502"},{"issue":"10","key":"1661_CR7","doi-asserted-by":"publisher","first-page":"2846","DOI":"10.1007\/s11263-021-01486-4","volume":"129","author":"X Chen","year":"2021","unstructured":"Chen, X., Pang, A., Yang, W., Ma, Y., Xu, L., & Yu, J. (2021). Sportscap: Monocular 3d human motion capture and fine-grained understanding in challenging sports videos. International Journal of Computer Vision, 129(10), 2846\u20132864.","journal-title":"International Journal of Computer Vision"},{"key":"1661_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Y., Dai, X., Liu, M., Chen, D., Yuan, L., & Liu, Z. (2020). Dynamic convolution: Attention over convolution kernels. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 11030\u201311039).","DOI":"10.1109\/CVPR42600.2020.01104"},{"issue":"4","key":"1661_CR9","doi-asserted-by":"publisher","first-page":"340","DOI":"10.1007\/s11263-018-1111-5","volume":"127","author":"A Cherian","year":"2019","unstructured":"Cherian, A., & Gould, S. (2019). Second-order temporal pooling for action recognition. International Journal of Computer Vision, 127(4), 340\u2013362.","journal-title":"International Journal of Computer Vision"},{"key":"1661_CR10","doi-asserted-by":"crossref","unstructured":"Cong, Y., Liao, W., Ackermann, H., Rosenhahn, B., & Yang, M. Y. (2021). Spatial-temporal transformer for dynamic scene graph generation. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 16372\u201316382).","DOI":"10.1109\/ICCV48922.2021.01606"},{"key":"1661_CR11","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L. J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In 2009 IEEE conference on computer vision and pattern recognition (pp. 248\u2013255). Ieee.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1661_CR12","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., & Gelly, S., et\u00a0al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv:2010.11929"},{"key":"1661_CR13","doi-asserted-by":"crossref","unstructured":"Fan, H., Xiong, B., Mangalam, K., Li, Y., Yan, Z., Malik, J., & Feichtenhofer, C. (2021). Multiscale vision transformers. arXiv:2104.11227","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"1661_CR14","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., & He, K. (2019). Slowfast networks for video recognition. In Proceedings of the IEEE international conference on computer vision (pp. 6202\u20136211).","DOI":"10.1109\/ICCV.2019.00630"},{"issue":"2","key":"1661_CR15","doi-asserted-by":"publisher","first-page":"420","DOI":"10.1007\/s11263-019-01225-w","volume":"128","author":"C Feichtenhofer","year":"2020","unstructured":"Feichtenhofer, C., Pinz, A., Wildes, R. P., & Zisserman, A. (2020). Deep insights into convolutional networks for video recognition. International Journal of Computer Vision, 128(2), 420\u2013437.","journal-title":"International Journal of Computer Vision"},{"key":"1661_CR16","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., & Zisserman, A. (2016). Convolutional two-stream network fusion for video action recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1933\u20131941).","DOI":"10.1109\/CVPR.2016.213"},{"issue":"2","key":"1661_CR17","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1023\/A:1008155721192","volume":"37","author":"JM Ferryman","year":"2000","unstructured":"Ferryman, J. M., Maybank, S. J., & Worrall, A. D. (2000). Visual surveillance for moving vehicles. International Journal of Computer Vision, 37(2), 187\u2013197.","journal-title":"International Journal of Computer Vision"},{"issue":"2","key":"1661_CR18","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2019.2938758","volume":"43","author":"SH Gao","year":"2019","unstructured":"Gao, S. H., Cheng, M. M., Zhao, K., Zhang, X. Y., Yang, M. H., & Torr, P. (2019). Res2net: A new multi-scale backbone architecture. IEEE Transactions on Pattern Analysis and Machine Intelligence, 43(2), 652\u2013662.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1661_CR19","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Carreira, J., Doersch, C., & Zisserman, A. (2019). Video action transformer network. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 244\u2013253).","DOI":"10.1109\/CVPR.2019.00033"},{"key":"1661_CR20","doi-asserted-by":"crossref","unstructured":"Girdhar, R., & Grauman, K. (2021). Anticipative video transformer. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 13505\u201313515).","DOI":"10.1109\/ICCV48922.2021.01325"},{"key":"1661_CR21","doi-asserted-by":"crossref","unstructured":"Goyal, R., Kahou, S. E., Michalski, V., Materzynska, J., Westphal, S., Kim, H., et al. (2017). The\u201csomething something\u201d video database for learning and evaluating visual common sense. Proceedings of the IEEE International Conference on Computer Vision, 1, 5.","DOI":"10.1109\/ICCV.2017.622"},{"key":"1661_CR22","doi-asserted-by":"crossref","unstructured":"Hara, K., Kataoka, H., & Satoh, Y. (2017). Learning spatio-temporal features with 3d residual networks for action recognition. In Proceedings of the IEEE international conference on computer vision workshops (pp. 3154\u20133160).","DOI":"10.1109\/ICCVW.2017.373"},{"key":"1661_CR23","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"key":"1661_CR24","doi-asserted-by":"crossref","unstructured":"Ilg, E., Mayer, N., Saikia, T., Keuper, M., Dosovitskiy, A., & Brox, T. (2017). Flownet 2.0: Evolution of optical flow estimation with deep networks. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2462\u20132470).","DOI":"10.1109\/CVPR.2017.179"},{"key":"1661_CR25","first-page":"667","volume":"29","author":"X Jia","year":"2016","unstructured":"Jia, X., De Brabandere, B., Tuytelaars, T., & Gool, L. V. (2016). Dynamic filter networks. Advances in Neural Information Processing Systems, 29, 667\u2013675.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"1661_CR26","doi-asserted-by":"crossref","unstructured":"Jiang, B., Wang, M., Gan, W., Wu, W., & Yan, J. (2019). Stm: Spatiotemporal and motion encoding for action recognition. In Proceedings of the IEEE international conference on computer vision (pp. 2000\u20132009).","DOI":"10.1109\/ICCV.2019.00209"},{"key":"1661_CR27","doi-asserted-by":"crossref","unstructured":"Kanojia, G., Kumawat, S., & Raman, S. (2019). Attentive spatio-temporal representation learning for diving classification. In Proceedings of the IEEE conference on computer vision and pattern recognition workshops.","DOI":"10.1109\/CVPRW.2019.00302"},{"key":"1661_CR28","doi-asserted-by":"crossref","unstructured":"Khowaja, S. A., & Lee, S. L. (2020). Semantic image networks for human action recognition. International Journal of Computer Vision.","DOI":"10.1007\/s11263-019-01248-3"},{"key":"1661_CR29","doi-asserted-by":"crossref","unstructured":"Kwon, H., Kim, M., Kwak, S., & Cho, M. (2020). Motionsqueeze: Neural motion feature learning for video understanding. In European conference on computer vision (pp. 345\u2013362). Springer.","DOI":"10.1007\/978-3-030-58517-4_21"},{"key":"1661_CR30","doi-asserted-by":"crossref","unstructured":"Li, Y., Ji, B., Shi, X., Zhang, J., Kang, B., & Wang, L. (2020). Tea: Temporal excitation and aggregation for action recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 909\u2013918).","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"1661_CR31","doi-asserted-by":"crossref","unstructured":"Li, Y., Li, Y., & Vasconcelos, N. (2018). Resound: Towards action recognition without representation bias. In Proceedings of the European conference on computer vision (pp. 513\u2013528).","DOI":"10.1007\/978-3-030-01231-1_32"},{"key":"1661_CR32","doi-asserted-by":"crossref","unstructured":"Lin, J., Gan, C., & Han, S. (2019). Tsm: Temporal shift module for efficient video understanding. In Proceedings of the IEEE international conference on computer vision (pp. 7083\u20137093).","DOI":"10.1109\/ICCV.2019.00718"},{"key":"1661_CR33","doi-asserted-by":"crossref","unstructured":"Liu, Z., Luo, D., Wang, Y., Wang, L., Tai, Y., Wang, C., Li, J., Huang, F., & Lu, T. (2020). Teinet: Towards an efficient architecture for video recognition. In Proceedings of the AAAI conference on artificial intelligence (Vol. 34, pp. 11669\u201311676).","DOI":"10.1609\/aaai.v34i07.6836"},{"key":"1661_CR34","doi-asserted-by":"crossref","unstructured":"Liu, Z., Wang, L., Wu, W., Qian, C., & Lu, T. (2020). Tam: Temporal adaptive module for video recognition. arXiv:2005.06803","DOI":"10.1109\/ICCV48922.2021.01345"},{"issue":"8","key":"1661_CR35","doi-asserted-by":"publisher","first-page":"993","DOI":"10.1007\/s11263-018-1129-8","volume":"127","author":"C Lu","year":"2019","unstructured":"Lu, C., Shi, J., Wang, W., & Jia, J. (2019). Fast abnormal event detection. International Journal of Computer Vision, 127(8), 993\u20131011.","journal-title":"International Journal of Computer Vision"},{"key":"1661_CR36","doi-asserted-by":"crossref","unstructured":"Luo, C., & Yuille, A. L. (2019). Grouped spatial-temporal aggregation for efficient action recognition. In Proceedings of the IEEE international conference on computer vision (pp. 5512\u20135521).","DOI":"10.1109\/ICCV.2019.00561"},{"key":"1661_CR37","doi-asserted-by":"crossref","unstructured":"Ma, C. Y., Kadav, A., Melvin, I., Kira, Z., AlRegib, G., & Peter Graf, H. (2018). Attend and interact: Higher-order object interactions for video understanding. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 6790\u20136800).","DOI":"10.1109\/CVPR.2018.00710"},{"key":"1661_CR38","unstructured":"Mahdisoltani, F., Berger, G., Gharbieh, W., Fleet, D., & Memisevic, R. (2018). Fine-grained video classification and captioning. arXiv:1804.092355(6)"},{"key":"1661_CR39","doi-asserted-by":"crossref","unstructured":"Materzynska, J., Xiao, T., Herzig, R., Xu, H., Wang, X., & Darrell, T. (2020). Something-else: Compositional action recognition with spatial-temporal interaction networks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 1049\u20131059).","DOI":"10.1109\/CVPR42600.2020.00113"},{"key":"1661_CR40","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2021.103219","volume":"208","author":"C Plizzari","year":"2021","unstructured":"Plizzari, C., Cannici, M., & Matteucci, M. (2021). Skeleton-based action recognition via spatial and temporal transformer networks. Computer Vision and Image Understanding, 208, 103219.","journal-title":"Computer Vision and Image Understanding"},{"key":"1661_CR41","doi-asserted-by":"crossref","unstructured":"Ranjan, A., & Black, M. J. (2017). Optical flow estimation using a spatial pyramid network. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4161\u20134170).","DOI":"10.1109\/CVPR.2017.291"},{"issue":"6","key":"1661_CR42","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2016). Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence, 39(6), 1137\u20131149.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1661_CR43","unstructured":"Simonyan, K., & Zisserman, A. (2014). Two-stream convolutional networks for action recognition in videos. In Advances in neural information processing systems (pp. 568\u2013576)."},{"key":"1661_CR44","doi-asserted-by":"crossref","unstructured":"Srinivas, A., Lin, T. Y., Parmar, N., Shlens, J., Abbeel, P., & Vaswani, A. (2021). Bottleneck transformers for visual recognition. arXiv:2101.11605","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"1661_CR45","doi-asserted-by":"crossref","unstructured":"Sun, D., Yang, X., Liu, M. Y., & Kautz, J. (2018). Pwc-net: Cnns for optical flow using pyramid, warping, and cost volume. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 8934\u20138943).","DOI":"10.1109\/CVPR.2018.00931"},{"key":"1661_CR46","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Ioffe, S., Vanhoucke, V., & Alemi, A. A. (2017). Inception-v4, inception-resnet and the impact of residual connections on learning. In Thirty-first AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"1661_CR47","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., & Rabinovich, A. (2015). Going deeper with convolutions. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1\u20139).","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"1661_CR48","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., & Wojna, Z. (2016). Rethinking the inception architecture for computer vision. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2818\u20132826).","DOI":"10.1109\/CVPR.2016.308"},{"key":"1661_CR49","doi-asserted-by":"crossref","unstructured":"Tian, Y., Che, Z., Bao, W., Zhai, G., & Gao, Z. (2020). Self-supervised motion representation via scattering local motion cues. In European conference on computer vision (pp. 71\u201389). Springer","DOI":"10.1007\/978-3-030-58568-6_5"},{"key":"1661_CR50","doi-asserted-by":"crossref","unstructured":"Tian, Y., Lu, G., Min, X., Che, Z., Zhai, G., Guo, G., & Gao, Z. (2021). Self-conditioned probabilistic learning of video rescaling. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 4490\u20134499).","DOI":"10.1109\/ICCV48922.2021.00445"},{"key":"1661_CR51","doi-asserted-by":"crossref","unstructured":"Tian, Y., Min, X., Zhai, G., & Gao, Z. (2019). Video-based early asd detection via temporal pyramid networks. In 2019 IEEE international conference on multimedia and expo (pp. 272\u2013277). IEEE.","DOI":"10.1109\/ICME.2019.00055"},{"key":"1661_CR52","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., & J\u00e9gou, H. (2020). Training data-efficient image transformers and distillation through attention. arXiv:2012.12877"},{"key":"1661_CR53","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., & Paluri, M. (2015). Learning spatiotemporal features with 3d convolutional networks. In Proceedings of the IEEE international conference on computer vision (pp. 4489\u20134497).","DOI":"10.1109\/ICCV.2015.510"},{"key":"1661_CR54","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., & Paluri, M. (2018). A closer look at spatiotemporal convolutions for action recognition. In Proceedings of the IEEE conference on Computer Vision and Pattern Recognition (pp. 6450\u20136459).","DOI":"10.1109\/CVPR.2018.00675"},{"key":"1661_CR55","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, L., & Polosukhin, I. (2017). Attention is all you need. arXiv:1706.03762"},{"key":"1661_CR56","doi-asserted-by":"crossref","unstructured":"Wang, H., Tran, D., Torresani, L., & Feiszli, M. (2020). Video modeling with correlation networks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 352\u2013361).","DOI":"10.1109\/CVPR42600.2020.00043"},{"key":"1661_CR57","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, W., Li, W., & Van Gool, L. (2018). Appearance-and-relation networks for video classification. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1430\u20131439).","DOI":"10.1109\/CVPR.2018.00155"},{"key":"1661_CR58","doi-asserted-by":"crossref","unstructured":"Wang, L., Tong, Z., Ji, B., & Wu, G. (2020). Tdn: Temporal difference networks for efficient action recognition. arXiv:2012.10071","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"1661_CR59","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., & Van Gool, L. (2016). Temporal segment networks: Towards good practices for deep action recognition. In European conference on computer vision (pp. 20\u201336). Springer.","DOI":"10.1007\/978-3-319-46484-8_2"},{"issue":"11","key":"1661_CR60","doi-asserted-by":"publisher","first-page":"2740","DOI":"10.1109\/TPAMI.2018.2868668","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., & Van Gool, L. (2018). Temporal segment networks for action recognition in videos. IEEE Transactions on Pattern Analysis and Machine Intelligence, 41(11), 2740\u20132755.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1661_CR61","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., & He, K. (2018). Non-local neural networks. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 7794\u20137803).","DOI":"10.1109\/CVPR.2018.00813"},{"key":"1661_CR62","doi-asserted-by":"crossref","unstructured":"Wang, X., & Gupta, A. (2018). Videos as space-time region graphs. In Proceedings of the European conference on computer vision (pp. 399\u2013417).","DOI":"10.1007\/978-3-030-01228-1_25"},{"key":"1661_CR63","doi-asserted-by":"crossref","unstructured":"Wu, Z., Li, H., Zheng, Y., Xiong, C., Jiang, Y. G., & Davis, L. S. (2021). A coarse-to-fine framework for resource efficient video recognition. International Journal of Computer Vision.","DOI":"10.1007\/s11263-021-01508-1"},{"key":"1661_CR64","doi-asserted-by":"crossref","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., & Murphy, K. (2018). Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification. In Proceedings of the European conference on computer vision (pp. 305\u2013321).","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"1661_CR65","unstructured":"Yang, B., Bender, G., Le, Q.V., & Ngiam, J. (2019). Condconv: Conditionally parameterized convolutions for efficient inference. arXiv:1904.04971"},{"key":"1661_CR66","doi-asserted-by":"crossref","unstructured":"Zach, C., Pock, T., & Bischof, H. (2007). A duality based approach for realtime tv-l 1 optical flow. In Joint pattern recognition symposium (pp. 214\u2013223). Springer.","DOI":"10.1007\/978-3-540-74936-3_22"},{"key":"1661_CR67","doi-asserted-by":"crossref","unstructured":"Zhang, C., Zou, Y., Chen, G., & Gan, L. (2020). Pan: Towards fast action recognition via learning persistence of appearance. arXiv:2008.03462","DOI":"10.1145\/3343031.3350876"},{"key":"1661_CR68","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhou, X., Lin, M., & Sun, J. (2018). Shufflenet: An extremely efficient convolutional neural network for mobile devices. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 6848\u20136856).","DOI":"10.1109\/CVPR.2018.00716"},{"key":"1661_CR69","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Li, X., Liu, C., Shuai, B., Zhu, Y., Brattoli, B., Chen, H., Marsic, I., & Tighe, J. (2021). Vidtr: Video transformer without convolutions. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 13577\u201313587).","DOI":"10.1109\/ICCV48922.2021.01332"},{"key":"1661_CR70","doi-asserted-by":"crossref","unstructured":"Zhou, B., Andonian, A., Oliva, A., & Torralba, A. (2018). Temporal relational reasoning in videos. In Proceedings of the European conference on computer vision (pp. 803\u2013818).","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"1661_CR71","doi-asserted-by":"crossref","unstructured":"Zolfaghari, M., Singh, K., & Brox, T. (2018). Eco: Efficient convolutional network for online video understanding. In Proceedings of the European conference on computer vision (pp. 695\u2013712).","DOI":"10.1007\/978-3-030-01216-8_43"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01661-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-022-01661-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01661-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,10]],"date-time":"2022-09-10T10:13:27Z","timestamp":1662804807000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-022-01661-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,7]]},"references-count":71,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2022,10]]}},"alternative-id":["1661"],"URL":"https:\/\/doi.org\/10.1007\/s11263-022-01661-1","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,8,7]]},"assertion":[{"value":"9 September 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 July 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 August 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}