{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T17:37:12Z","timestamp":1772818632626,"version":"3.50.1"},"reference-count":63,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T00:00:00Z","timestamp":1696809600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T00:00:00Z","timestamp":1696809600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62171328"],"award-info":[{"award-number":["62171328"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2024,1]]},"DOI":"10.1007\/s00521-023-09069-9","type":"journal-article","created":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T11:04:12Z","timestamp":1696849452000},"page":"943-959","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Action recognition based on adaptive region perception"],"prefix":"10.1007","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3900-6456","authenticated-orcid":false,"given":"Tongwei","family":"Lu","sequence":"first","affiliation":[]},{"given":"Qi","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Feng","family":"Min","sequence":"additional","affiliation":[]},{"given":"Yanduo","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,10,9]]},"reference":[{"key":"9069_CR1","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2017) Imagenet classification with deep convolutional neural networks. Commun ACM 60:84\u201390. https:\/\/doi.org\/10.1145\/3065386","journal-title":"Commun ACM"},{"key":"9069_CR2","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition 2016-December, pp 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"issue":"7","key":"9069_CR3","doi-asserted-by":"publisher","first-page":"926","DOI":"10.1109\/LSP.2018.2822810","volume":"25","author":"F Wang","year":"2018","unstructured":"Wang F, Cheng J, Liu W, Liu H (2018) Additive margin softmax for face verification. IEEE Signal Process Lett 25(7):926\u2013930","journal-title":"IEEE Signal Process Lett"},{"issue":"6","key":"9069_CR4","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren S, He K, Girshick R, Sun J (2017) Faster r-cnn: towards real-time object detection with region proposal networks. IEEE Trans Pattern Anal Mach Intell 39(6):1137\u20131149","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"9069_CR5","doi-asserted-by":"publisher","unstructured":"He K, Gkioxari G, Dollar P, Girshick R (2017). Mask R-CNN. https:\/\/doi.org\/10.1109\/ICCV.2017.322","DOI":"10.1109\/ICCV.2017.322"},{"key":"9069_CR6","doi-asserted-by":"publisher","unstructured":"Feichtenhofer C, Pinz A, Zisserman A (2016) Convolutional two-stream network fusion for video action recognition. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition 2016-December, pp 1933\u20131941. https:\/\/doi.org\/10.1109\/CVPR.2016.213","DOI":"10.1109\/CVPR.2016.213"},{"key":"9069_CR7","doi-asserted-by":"publisher","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings\u201430th IEEE conference on computer vision and pattern recognition, CVPR 2017 2017-January, pp 4724\u20134733. https:\/\/doi.org\/10.1109\/CVPR.2017.502","DOI":"10.1109\/CVPR.2017.502"},{"key":"9069_CR8","unstructured":"Chen Y, Kalantidis Y, Li J, Yan S, Feng J (2018) A2-nets: double attention networks. Adv Neural Inf Process Syst 352"},{"issue":"1","key":"9069_CR9","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1007\/s11263-012-0594-8","volume":"103","author":"H Wang","year":"2013","unstructured":"Wang H, Klaser A, Schmid C, Liu C-L (2013) Dense trajectories and motion boundary descriptors for action recognition. Int J Comput Vis 103(1):60\u201379","journal-title":"Int J Comput Vis"},{"key":"9069_CR10","unstructured":"Yang B, Le QV, Bender G, Ngiam J (2019) Condconv: conditionally parameterized convolutions for efficient inference. Adv Neural Inf Process Syst 32"},{"key":"9069_CR11","doi-asserted-by":"publisher","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, van Gool L (2016) Temporal segment networks: towards good practices for deep action recognition. Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics) 9912 LNCS, pp 20\u201336. https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"9069_CR12","doi-asserted-by":"publisher","unstructured":"Wang Y, Long M, Wang J, Yu PS (2017) Spatiotemporal pyramid network for video action recognition. https:\/\/doi.org\/10.1109\/CVPR.2017.226","DOI":"10.1109\/CVPR.2017.226"},{"key":"9069_CR13","doi-asserted-by":"publisher","unstructured":"Feichtenhofer C, Pinz A, Wildes RP (2017) Spatiotemporal multiplier networks for video action recognition. In: Proceedings\u201430th IEEE conference on computer vision and pattern recognition, CVPR 2017 2017-January, pp 7445\u20137454. https:\/\/doi.org\/10.1109\/CVPR.2017.787","DOI":"10.1109\/CVPR.2017.787"},{"key":"9069_CR14","doi-asserted-by":"publisher","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision 2015 international conference on computer vision ICCV 2015, pp 4489\u20134497. https:\/\/doi.org\/10.1109\/ICCV.2015.510","DOI":"10.1109\/ICCV.2015.510"},{"key":"9069_CR15","doi-asserted-by":"publisher","unstructured":"Stroud JC, Ross DA, Sun C, Deng J, Sukthankar R (2020) D3d: Distilled 3d networks for video action recognition. In: Proceedings\u20142020 IEEE winter conference on applications of computer vision, WACV 2020, pp 614\u2013623. https:\/\/doi.org\/10.1109\/WACV45572.2020.9093274","DOI":"10.1109\/WACV45572.2020.9093274"},{"key":"9069_CR16","doi-asserted-by":"publisher","unstructured":"Bobick A, Davis J (1996) An appearance-based representation of action, vol 1, pp 307\u2013312. https:\/\/doi.org\/10.1109\/ICPR.1996.546039","DOI":"10.1109\/ICPR.1996.546039"},{"issue":"2\u20133 SPEC. ISS.","key":"9069_CR17","doi-asserted-by":"publisher","first-page":"249","DOI":"10.1016\/j.cviu.2006.07.013","volume":"104","author":"D Weinland","year":"2006","unstructured":"Weinland D, Ronfard R, Boyer E (2006) Free viewpoint action recognition using motion history volumes. Comput Vis Image Underst 104(2\u20133 SPEC. ISS.):249\u2013257","journal-title":"Comput Vis Image Underst"},{"key":"9069_CR18","doi-asserted-by":"publisher","unstructured":"Nguyen TP, Manzanera A (2013) Action recognition using bag of features extracted from a beam of trajectories, pp 4354\u20134357. https:\/\/doi.org\/10.1109\/ICIP.2013.6738897","DOI":"10.1109\/ICIP.2013.6738897"},{"key":"9069_CR19","doi-asserted-by":"crossref","unstructured":"Wang H, Schmid C (2013) Action recognition with improved trajectories. In: International conference on computer vision, pp 3551\u20133558","DOI":"10.1109\/ICCV.2013.441"},{"key":"9069_CR20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.patcog.2017.06.022","volume":"72","author":"C Shi","year":"2017","unstructured":"Shi C, Wang Y, Jia F, He K, Wang C, Xiao B (2017) Fisher vector for scene character recognition: a comprehensive evaluation. Pattern Recognit 72:1\u201314","journal-title":"Pattern Recognit"},{"key":"9069_CR21","doi-asserted-by":"crossref","unstructured":"Danafar S, Gheissari N (2007) Action recognition for surveillance applications using optic flow and svm. Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics) (PART 2), pp 457\u2013466","DOI":"10.1007\/978-3-540-76390-1_45"},{"key":"9069_CR22","doi-asserted-by":"crossref","unstructured":"Oreifej O, Liu Z (2013) Hon4d: histogram of oriented 4d normals for activity recognition from depth sequences. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 716\u2013723","DOI":"10.1109\/CVPR.2013.98"},{"key":"9069_CR23","doi-asserted-by":"crossref","unstructured":"Wang Y, Zhou L, Qiao Y (2018) Temporal hallucinating for action recognition with few still images. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 5314\u20135322","DOI":"10.1109\/CVPR.2018.00557"},{"key":"9069_CR24","doi-asserted-by":"crossref","unstructured":"Wang L, Koniusz P, Huynh D (2019) Hallucinating idt descriptors and i3d optical flow features for action recognition with cnns. In: Proceedings of the IEEE international conference on computer vision, pp 8697\u20138707","DOI":"10.1109\/ICCV.2019.00879"},{"key":"9069_CR25","doi-asserted-by":"crossref","unstructured":"Tang Y, Ma L, Zhou L (2019) Hallucinating optical flow features for video classification. In: IJCAI international joint conference on artificial intelligence, pp 926\u2013932","DOI":"10.24963\/ijcai.2019\/130"},{"key":"9069_CR26","doi-asserted-by":"crossref","unstructured":"Wang L, Koniusz P (2021) Self-supervising action recognition by statistical moment and subspace descriptors. In: MM 2021\u2014proceedings of the 29th ACM international conference on multimedia, pp 4324\u20134333","DOI":"10.1145\/3474085.3475572"},{"key":"9069_CR27","first-page":"568","volume":"1","author":"K Simonyan","year":"2014","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. Adv Neural Inf Process Syst 1:568\u2013576","journal-title":"Adv Neural Inf Process Syst"},{"key":"9069_CR28","doi-asserted-by":"publisher","unstructured":"Diba A, Sharma V, VanGool L (2017) Deep temporal linear encoding networks. In: Proceedings\u201430th IEEE conference on computer vision and pattern recognition, CVPR 2017 2017-January, pp 1541\u20131550. https:\/\/doi.org\/10.1109\/CVPR.2017.168","DOI":"10.1109\/CVPR.2017.168"},{"key":"9069_CR29","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1016\/j.patrec.2017.04.004","volume":"92","author":"L Wang","year":"2017","unstructured":"Wang L, Ge L, Li R, Fang Y (2017) Three-stream CNNs for action recognition. Pattern Recognit Lett 92:33\u201340. https:\/\/doi.org\/10.1016\/j.patrec.2017.04.004","journal-title":"Pattern Recognit Lett"},{"issue":"1","key":"9069_CR30","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji S, Xu W, Yang M, Yu K (2013) 3d convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35(1):221\u2013231","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"16","key":"9069_CR31","first-page":"1","volume":"8","author":"D Tran","year":"2017","unstructured":"Tran D, Ray J, Shou Z, Chang S-F, Paluri M (2017) Convnet architecture search for spatiotemporal feature learning. Comput Res Repos 8(16):1\u201312","journal-title":"Comput Res Repos"},{"key":"9069_CR32","doi-asserted-by":"publisher","unstructured":"Hara K, Kataoka H, Satoh Y (2018) Can spatiotemporal 3d cnns retrace the history of 2d cnns and imagenet?. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 6546\u20136555. https:\/\/doi.org\/10.1109\/CVPR.2018.00685","DOI":"10.1109\/CVPR.2018.00685"},{"key":"9069_CR33","unstructured":"Diba A, Fayyaz M, Sharma V, Karami AH, Arzani MM, Yousefzadeh R, Van\u00a0Gool L (2017) Temporal 3D ConvNets: new architecture and transfer learning for video classification. https:\/\/arxiv.org\/abs\/1711.08200"},{"key":"9069_CR34","doi-asserted-by":"publisher","unstructured":"Zhou B, Andonian A, Oliva A, Torralba A (2018) Temporal relational reasoning in videos. Lecture notes in computer science (including subseries lecture notes in artificial intelligence and lecture notes in bioinformatics) 11205 LNCS, pp 831\u2013846. https:\/\/doi.org\/10.1007\/978-3-030-01246-5_49","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"9069_CR35","doi-asserted-by":"publisher","unstructured":"Zolfaghari M, Singh K, Brox T (2018) Eco: efficient convolutional network for online video understanding. Lecture Notes in computer science (including subseries lecture notes in artificial intelligence and lecture notes in bioinformatics) 11206 LNCS, pp 713\u2013730. https:\/\/doi.org\/10.1007\/978-3-030-01216-8_43","DOI":"10.1007\/978-3-030-01216-8_43"},{"key":"9069_CR36","doi-asserted-by":"publisher","unstructured":"Lee M, Lee S, Son S, Park G, Kwak N (2018) Motion feature network: fixed motion filter for action recognition. Lecture notes in computer science (including subseries lecture notes in artificial intelligence and lecture notes in bioinformatics) 11214 LNCS, pp 392\u2013408. https:\/\/doi.org\/10.1007\/978-3-030-01249-6_24","DOI":"10.1007\/978-3-030-01249-6_24"},{"key":"9069_CR37","doi-asserted-by":"publisher","unstructured":"Lin J, Gan C, Han S (2019) Tsm: temporal shift module for efficient video understanding. In: Proceedings of the IEEE international conference on computer vision 2019-October, pp 7082\u20137092. https:\/\/doi.org\/10.1109\/ICCV.2019.00718","DOI":"10.1109\/ICCV.2019.00718"},{"key":"9069_CR38","doi-asserted-by":"crossref","unstructured":"Shao H, Qian S, Liu Y (2020) Temporal interlacing network. In: Paper presented at the AAAI 2020\u201434th AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v34i07.6872"},{"key":"9069_CR39","doi-asserted-by":"crossref","unstructured":"Wang L, Koniusz P (2022) Uncertainty-DTW for time series and sequences","DOI":"10.1007\/978-3-031-19803-8_11"},{"key":"9069_CR40","doi-asserted-by":"crossref","unstructured":"Liu Z, Luo D, Wang Y, Wang L, Tai Y, Wang C, Li J, Huang F, Lu T (2020) TEINet: towards an efficient architecture for video recognition. In: Paper presented at the AAAI 2020\u201434th AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v34i07.6836"},{"key":"9069_CR41","doi-asserted-by":"publisher","unstructured":"Liu Z, Wang L, Wu W, Qian C, Lu T (2021) Tam: temporal adaptive module for video recognition. In: Proceedings of the IEEE international conference on computer vision, pp 13688\u201313698. https:\/\/doi.org\/10.1109\/ICCV48922.2021.01345","DOI":"10.1109\/ICCV48922.2021.01345"},{"key":"9069_CR42","doi-asserted-by":"publisher","unstructured":"Li Y, Ji B, Shi X, Zhang J, Kang B, Wang L (2020) Tea: temporal excitation and aggregation for action recognition. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 906\u2013915. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00099","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"9069_CR43","doi-asserted-by":"publisher","unstructured":"Jiang B, Wang M, Gan W, Wu W, Yan J (2019) Stm: spatiotemporal and motion encoding for action recognition. In: Proceedings of the IEEE international conference on computer vision 2019-October, pp 2000\u20132009. https:\/\/doi.org\/10.1109\/ICCV.2019.00209","DOI":"10.1109\/ICCV.2019.00209"},{"key":"9069_CR44","doi-asserted-by":"publisher","unstructured":"Chen Y, Dai X, Liu M, Chen D, Yuan L, Liu Z (2020) Dynamic convolution: attention over convolution kernels. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 11027\u201311036. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01104","DOI":"10.1109\/CVPR42600.2020.01104"},{"key":"9069_CR45","unstructured":"Zhang Y, Zhang J, Wang Q, Zhong Z (2020) Dynet: dynamic convolution for accelerating convolutional neural networks"},{"issue":"8","key":"9069_CR46","doi-asserted-by":"publisher","first-page":"2011","DOI":"10.1109\/TPAMI.2019.2913372","volume":"42","author":"J Hu","year":"2020","unstructured":"Hu J, Shen L, Albanie S, Sun G, Wu E (2020) Squeeze-and-excitation networks. IEEE Trans Pattern Anal Mach Intell 42(8):2011\u20132023","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"9069_CR47","doi-asserted-by":"publisher","unstructured":"Goyal R, Kahou SE, Michalski V, Materzynska J, Westphal S, Kim H, Haenel V, Fruend I, Yianilos P, Mueller-Freitag M, Hoppe F, Thurau C, Bax I, Memisevic R (2017) The \u2019something something\u2019 video database for learning and evaluating visual common sense. In: Proceedings of the IEEE international conference on computer vision 2017-October, pp 5843\u20135851. https:\/\/doi.org\/10.1109\/ICCV.2017.622","DOI":"10.1109\/ICCV.2017.622"},{"key":"9069_CR48","unstructured":"Kay W, Carreira J, Simonyan K, Zhang B, Hillier C, Vijayanarasimhan S, Viola F, Green T, Back T, Natsev P, Suleyman M, Zisserman A (2017) The kinetics human action video dataset"},{"key":"9069_CR49","doi-asserted-by":"publisher","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) Slowfast networks for video recognition. In: Proceedings of the IEEE international conference on computer vision 2019-October, pp 6201\u20136210. https:\/\/doi.org\/10.1109\/ICCV.2019.00630","DOI":"10.1109\/ICCV.2019.00630"},{"key":"9069_CR50","doi-asserted-by":"publisher","unstructured":"Wang X, Girshick R, Gupta A, He K (2018) Non-local neural networks. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 7794\u20137803. https:\/\/doi.org\/10.1109\/CVPR.2018.00813","DOI":"10.1109\/CVPR.2018.00813"},{"key":"9069_CR51","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Li F-F (2009) Imagenet: a large-scale hierarchical image database, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"9069_CR52","doi-asserted-by":"publisher","unstructured":"Xie S, Sun C, Huang J, Tu Z, Murphy K (2018) Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. Lecture notes in computer science (including subseries lecture notes in artificial intelligence and lecture notes in bioinformatics) 11219 LNCS, pp 318\u2013335. https:\/\/doi.org\/10.1007\/978-3-030-01267-0_19","DOI":"10.1007\/978-3-030-01267-0_19"},{"issue":"3","key":"9069_CR53","doi-asserted-by":"publisher","first-page":"368","DOI":"10.3390\/e24030368","volume":"24","author":"Q Yang","year":"2022","unstructured":"Yang Q, Lu T, Zhou H (2022) A spatio-temporal motion network for action recognition based on spatial attention. Entropy 24(3):368","journal-title":"Entropy"},{"key":"9069_CR54","doi-asserted-by":"publisher","unstructured":"Liu X, Lee J-Y, Jin H (2019) Learning video representations from correspondence proposals. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 4268\u20134276. https:\/\/doi.org\/10.1109\/CVPR.2019.00440","DOI":"10.1109\/CVPR.2019.00440"},{"key":"9069_CR55","unstructured":"Fan Q, Chen C-F, Kuehne H, Pistoia M, Cox D (2019) More is less: learning efficient video representations by big-little network and depthwise temporal aggregation. Adv Neural Inf Process Syst 32"},{"key":"9069_CR56","doi-asserted-by":"publisher","unstructured":"Li X, Wang Y, Zhou Z, Qiao Y (2020) Smallbignet: integrating core and contextual views for video classification. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 1089\u20131098. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00117","DOI":"10.1109\/CVPR42600.2020.00117"},{"key":"9069_CR57","doi-asserted-by":"publisher","unstructured":"Wang L, Li W, Van\u00a0Gool L (2018) Appearance-and-relation networks for video classification. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 1430\u20131439. https:\/\/doi.org\/10.1109\/CVPR.2018.00155","DOI":"10.1109\/CVPR.2018.00155"},{"key":"9069_CR58","doi-asserted-by":"publisher","unstructured":"Feichtenhofer C (2020) X3d: expanding architectures for efficient video recognition. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, pp 200\u2013210. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00028","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"9069_CR59","doi-asserted-by":"publisher","unstructured":"Zhang Y, Li X, Liu C, Shuai B, Zhu Y, Brattoli B, Chen H, Marsic I, Tighe J (2021) Vidtr: video transformer without convolutions. In: Proceedings of the IEEE international conference on computer vision, pp 13557\u201313567. https:\/\/doi.org\/10.1109\/ICCV48922.2021.01332","DOI":"10.1109\/ICCV48922.2021.01332"},{"key":"9069_CR60","unstructured":"Bertasius G, Wang H, Torresani L (2021) Is space-time attention all you need for video understanding?"},{"key":"9069_CR61","doi-asserted-by":"publisher","unstructured":"Arnab A, Dehghani M, Heigold G, Sun C, Lui M, Schmid C (2021) Vivit: a video vision transformer. In: Proceedings of the IEEE international conference on computer vision, pp 6816\u20136826. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00676","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"9069_CR62","doi-asserted-by":"publisher","unstructured":"Fan H, Xiong B, Mangalam K, Li Y, Yan Z, Malik J, Feichtenhofer C (2021) Multiscale vision transformers. In: Proceedings of the IEEE international conference on computer vision, pp 6804\u20136815. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00675","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"9069_CR63","first-page":"12493","volume":"15","author":"M Patrick","year":"2021","unstructured":"Patrick M, Campbell D, Asano Y, Misra I, Metze F, Feichtenhofer C, Vedaldi A, Henriques JF (2021) Keeping your eye on the ball: trajectory attention in video transformers. Adv Neural Inf Process Syst 15:12493\u201312506","journal-title":"Adv Neural Inf Process Syst"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-023-09069-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-023-09069-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-023-09069-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,5]],"date-time":"2024-01-05T07:12:05Z","timestamp":1704438725000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-023-09069-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,9]]},"references-count":63,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2024,1]]}},"alternative-id":["9069"],"URL":"https:\/\/doi.org\/10.1007\/s00521-023-09069-9","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10,9]]},"assertion":[{"value":"18 November 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 September 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 October 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}