{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T13:26:19Z","timestamp":1778851579779,"version":"3.51.4"},"reference-count":68,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2023,4,18]],"date-time":"2023-04-18T00:00:00Z","timestamp":1681776000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,4,18]],"date-time":"2023-04-18T00:00:00Z","timestamp":1681776000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s11063-023-11270-9","type":"journal-article","created":{"date-parts":[[2023,4,18]],"date-time":"2023-04-18T11:10:21Z","timestamp":1681816221000},"page":"7493-7509","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["GSoANet: Group Second-Order Aggregation Network for Video Action Recognition"],"prefix":"10.1007","volume":"55","author":[{"given":"Zhenwei","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bingbing","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianxin","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiangdong","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bin","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,4,18]]},"reference":[{"key":"11270_CR1","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: NIPS, vol. 27"},{"key":"11270_CR2","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3D convolutional networks. In: ICCV, pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"11270_CR3","doi-asserted-by":"crossref","unstructured":"Lin J, Gan C, Han S (2019) TSM: Temporal shift module for efficient video understanding. In: ICCV, pp. 7083\u20137093","DOI":"10.1109\/ICCV.2019.00718"},{"key":"11270_CR4","doi-asserted-by":"crossref","unstructured":"Neimark D, Bar O, Zohar M, Asselmann D (2021) Video transformer network. In: ICCV, pp 3163\u20133172","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"11270_CR5","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y (2015) Towards good practices for very deep two-stream convnets. arXiv preprint arXiv:1507.02159"},{"key":"11270_CR6","unstructured":"Christoph R, Pinz FA (2016) Spatiotemporal residual networks for video action recognition. arXiv preprint arXiv:1611.02155"},{"key":"11270_CR7","doi-asserted-by":"publisher","first-page":"1320","DOI":"10.1109\/LSP.2020.3011326","volume":"27","author":"T Liu","year":"2020","unstructured":"Liu T, Zhao R, Xiao J, Lam K-M (2020) Progressive motion representation distillation with two-branch networks for egocentric activity recognition. IEEE Signal Process Lett 27:1320\u20131324","journal-title":"IEEE Signal Process Lett"},{"key":"11270_CR8","doi-asserted-by":"crossref","unstructured":"Hara K, Kataoka H, Satoh Y (2018) Can spatiotemporal 3D CNNs retrace the history of 2D CNNs and imagenet? In: CVPR, pp 6546\u20136555","DOI":"10.1109\/CVPR.2018.00685"},{"key":"11270_CR9","doi-asserted-by":"crossref","unstructured":"Tran D, Wang H, Torresani L, Ray J, LeCun Y, Paluri M (2018) A closer look at spatiotemporal convolutions for action recognition. In: CVPR, pp 6450\u20136459","DOI":"10.1109\/CVPR.2018.00675"},{"key":"11270_CR10","unstructured":"Du X, Li Y, Cui Y, Qian R, Li J, Bello I (2021) Revisiting 3D ResNets for video recognition. arXiv preprint arXiv:2109.01696"},{"key":"11270_CR11","doi-asserted-by":"crossref","unstructured":"Li Y, Ji B, Shi X, Zhang J, Kang B, Wang L (2020) TEA: Temporal excitation and aggregation for action recognition. In: CVPR, pp 909\u2013918","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"11270_CR12","unstructured":"Sharir G, Noy A, Zelnik-Manor L (2021) An image is worth 16x16 words, what is a video worth? arXiv preprint arXiv:2103.13915"},{"key":"11270_CR13","unstructured":"Bertasius G, Wang H, Torresani L (2021) Is space-time attention all you need for video understanding? In: ICML, vol. 139, pp 813\u2013824"},{"key":"11270_CR14","doi-asserted-by":"crossref","unstructured":"Yan S, Xiong X, Arnab A, Lu Z, Zhang M, Sun C, Schmid C (2022) Multiview transformers for video recognition. arXiv preprint arXiv:2201.04288","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"11270_CR15","doi-asserted-by":"crossref","unstructured":"Wu C-Y, Li Y, Mangalam K, Fan H, Xiong B, Malik J, Feichtenhofer C (2022) Memvit: Memory-augmented multiscale vision transformer for efficient long-term video recognition. In: CVPR, pp 13587\u201313597","DOI":"10.1109\/CVPR52688.2022.01322"},{"key":"11270_CR16","doi-asserted-by":"crossref","unstructured":"Wang H, Kl\u00e4ser A, Schmid C, Liu C-L (2011) Action recognition by dense trajectories. In: CVPR, pp. 3169\u20133176","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"11270_CR17","doi-asserted-by":"crossref","unstructured":"Wang H, Schmid C (2013) Action recognition with improved trajectories. In: ICCV, pp 3551\u20133558","DOI":"10.1109\/ICCV.2013.441"},{"key":"11270_CR18","doi-asserted-by":"crossref","unstructured":"J\u00e9gou H, Douze M, Schmid C, P\u00e9rez P (2010) Aggregating local descriptors into a compact image representation. In: CVPR, pp 3304\u20133311","DOI":"10.1109\/CVPR.2010.5540039"},{"issue":"3","key":"11270_CR19","doi-asserted-by":"publisher","first-page":"222","DOI":"10.1007\/s11263-013-0636-x","volume":"105","author":"J S\u00e1nchez","year":"2013","unstructured":"S\u00e1nchez J, Perronnin F, Mensink T, Verbeek J (2013) Image classification with the fisher vector: theory and practice. Int J Comput Vision 105(3):222\u2013245","journal-title":"Int J Comput Vision"},{"key":"11270_CR20","unstructured":"Canas G, Poggio T, Rosasco L (2012) Learning manifolds with k-means and k-flats. Adv Neural Inform Process Syst 25"},{"key":"11270_CR21","doi-asserted-by":"crossref","unstructured":"Arandjelovic R, Gronat P, Torii A, Pajdla T, Sivic J (2016) NetVLAD: CNN architecture for weakly supervised place recognition. In: CVPR, pp 5297\u20135307","DOI":"10.1109\/CVPR.2016.572"},{"key":"11270_CR22","unstructured":"Miech A, Laptev I, Sivic J (2017) Learnable pooling with context gating for video classification. arXiv preprint arXiv:1706.06905"},{"key":"11270_CR23","doi-asserted-by":"crossref","unstructured":"Girdhar R, Ramanan D, Gupta A, Sivic J, Russell B (2017) Actionvlad: Learning spatio-temporal aggregation for action classification. In: CVPR, pp 971\u2013980","DOI":"10.1109\/CVPR.2017.337"},{"key":"11270_CR24","doi-asserted-by":"publisher","first-page":"174","DOI":"10.1016\/j.neucom.2017.12.020","volume":"282","author":"Q Sun","year":"2018","unstructured":"Sun Q, Wang Q, Zhang J, Li P (2018) Hyperlayer bilinear pooling with application to fine-grained categorization and image retrieval. Neurocomputing 282:174\u2013183","journal-title":"Neurocomputing"},{"key":"11270_CR25","doi-asserted-by":"crossref","unstructured":"Li P, Xie J, Wang Q, Zuo W (2017) Is second-order information helpful for large-scale visual recognition? In: ICCV, pp 2070\u20132078","DOI":"10.1109\/ICCV.2017.228"},{"key":"11270_CR26","doi-asserted-by":"crossref","unstructured":"Li P, Xie J, Wang Q, Gao Z (2018) Towards faster training of global covariance pooling networks by iterative matrix square root normalization. In: CVPR, pp 947\u2013955","DOI":"10.1109\/CVPR.2018.00105"},{"key":"11270_CR27","doi-asserted-by":"crossref","unstructured":"Wang Q, Li P, Hu Q, Zhu P, Zuo W (2019) Deep global generalized gaussian networks. In: CVPR, pp 5080\u20135088","DOI":"10.1109\/CVPR.2019.00522"},{"key":"11270_CR28","doi-asserted-by":"crossref","unstructured":"Liu Z, Mao H, Wu C-Y, Feichtenhofer C, Darrell T, Xie S (2022) A ConvNet for the 2020s. In: CVPR, pp 11976\u201311986","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"11270_CR29","doi-asserted-by":"crossref","unstructured":"Lin R, Xiao J, Fan J (2018) NeXtVLAD: An efficient neural network to aggregate frame-level features for large-scale video classification. In: ECCV","DOI":"10.1007\/978-3-030-11018-5_19"},{"key":"11270_CR30","doi-asserted-by":"crossref","unstructured":"Wang L, Li W, Li W, Van\u00a0Gool L (2018) Appearance-and-relation networks for video classification. In: CVPR, pp 1430\u20131439","DOI":"10.1109\/CVPR.2018.00155"},{"key":"11270_CR31","doi-asserted-by":"crossref","unstructured":"Zhou B, Andonian A, Oliva A, Torralba A (2018) Temporal relational reasoning in videos. In: ECCV, pp 803\u2013818","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"11270_CR32","doi-asserted-by":"crossref","unstructured":"Li X, Wang Y, Zhou Z, Qiao Y (2020) Smallbignet: Integrating core and contextual views for video classification. In: CVPR, pp 1092\u20131101","DOI":"10.1109\/CVPR42600.2020.00117"},{"key":"11270_CR33","doi-asserted-by":"crossref","unstructured":"Wang L, Tong Z, Ji B, Wu G (2021) TDN: Temporal difference networks for efficient action recognition. In: CVPR, pp 1895\u20131904","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"11270_CR34","unstructured":"Huang Z, Zhang S, Pan L, Qing Z, Tang M, Liu Z, Ang\u00a0Jr MH (2021) TAda! temporally-adaptive convolutions for video understanding. arXiv preprint arXiv:2110.06178"},{"key":"11270_CR35","doi-asserted-by":"crossref","unstructured":"Philbin J, Chum O, Isard M, Sivic J, Zisserman A (2007) Object retrieval with large vocabularies and fast spatial matching. In: 2007 IEEE conference on computer vision and pattern recognition, pp 1\u20138","DOI":"10.1109\/CVPR.2007.383172"},{"key":"11270_CR36","doi-asserted-by":"crossref","unstructured":"Laptev I, Marszalek M, Schmid C, Rozenfeld B (2008) Learning realistic human actions from movies. In: CVPR, pp 1\u20138","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"11270_CR37","doi-asserted-by":"crossref","unstructured":"Schuldt C, Laptev I, Caputo B (2004) Recognizing human actions: a local SVM approach. In: ICPR, vol. 3, pp 32\u201336","DOI":"10.1109\/ICPR.2004.1334462"},{"issue":"10","key":"11270_CR38","doi-asserted-by":"publisher","first-page":"4933","DOI":"10.1109\/TIP.2018.2846664","volume":"27","author":"Y Xu","year":"2018","unstructured":"Xu Y, Han Y, Hong R, Tian Q (2018) Sequential video VLAD: Training the aggregation locally and temporally. IEEE Trans Image Process 27(10):4933\u20134944","journal-title":"IEEE Trans Image Process"},{"key":"11270_CR39","doi-asserted-by":"crossref","unstructured":"Lin T-Y, RoyChowdhury A, Maji S (2015) Bilinear CNN models for fine-grained visual recognition. In: ICCV, pp 1449\u20131457","DOI":"10.1109\/ICCV.2015.170"},{"key":"11270_CR40","doi-asserted-by":"crossref","unstructured":"Gao Y, Beijbom O, Zhang N, Darrell T (2016) Compact bilinear pooling. In: CVPR, pp 317\u2013326","DOI":"10.1109\/CVPR.2016.41"},{"key":"11270_CR41","doi-asserted-by":"publisher","first-page":"107167","DOI":"10.1016\/j.patcog.2019.107167","volume":"100","author":"B Zhang","year":"2020","unstructured":"Zhang B, Wang Q, Lu X, Wang F, Li P (2020) Locality-constrained affine subspace coding for image classification and retrieval. Pattern Recogn 100:107167","journal-title":"Pattern Recogn"},{"key":"11270_CR42","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1016\/j.neucom.2021.03.003","volume":"445","author":"Q Sun","year":"2021","unstructured":"Sun Q, Zhang Z, Li P (2021) Second-order encoding networks for semantic segmentation. Neurocomputing 445:50\u201360","journal-title":"Neurocomputing"},{"key":"11270_CR43","doi-asserted-by":"crossref","unstructured":"Diba A, Sharma V, Van\u00a0Gool L (2017) Deep temporal linear encoding networks. In: CVPR, pp 2329\u20132338","DOI":"10.1109\/CVPR.2017.168"},{"key":"11270_CR44","unstructured":"Girdhar R, Ramanan D (2017) Attentional pooling for action recognition. In: NIPS, vol. 30"},{"key":"11270_CR45","doi-asserted-by":"crossref","unstructured":"Zhu X, Xu C, Hui L, Lu C, Tao D (2019) Approximated bilinear modules for temporal modeling. In: ICCV, pp 3494\u20133503","DOI":"10.1109\/ICCV.2019.00359"},{"key":"11270_CR46","doi-asserted-by":"crossref","unstructured":"Li Y, Song S, Li Y, Liu J (2019) Temporal bilinear networks for video action recognition. In: AAAI, vol. 33, pp 8674\u20138681","DOI":"10.1609\/aaai.v33i01.33018674"},{"key":"11270_CR47","unstructured":"Gao Z, Wang Q, Zhang B, Hu Q, Li P (2021) Temporal-attentive covariance pooling networks for video recognition. In: NIPS, vol. 34, pp 13587\u201313598"},{"key":"11270_CR48","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In: CVPR, pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"11270_CR49","doi-asserted-by":"crossref","unstructured":"Xie S, Sun C, Huang J, Tu Z, Murphy K (2018) Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification. In: ECCV, pp 305\u2013321","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"11270_CR50","unstructured":"Soomro K, Zamir AR, Shah M (2012) UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402"},{"key":"11270_CR51","doi-asserted-by":"crossref","unstructured":"Kuehne H, Jhuang H, Garrote E, Poggio T, Serre T (2011) HMDB: A large video database for human motion recognition. In: ICCV, pp 2556\u20132563","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"11270_CR52","unstructured":"Kingma DP, Ba J (2014) Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980"},{"key":"11270_CR53","doi-asserted-by":"crossref","unstructured":"Crasto N, Weinzaepfel P, Alahari K, Schmid C (2019) MARS: Motion-augmented RGB stream for action recognition. In: CVPR, pp 7882\u20137891","DOI":"10.1109\/CVPR.2019.00807"},{"key":"11270_CR54","unstructured":"Zhang S, Guo S, Huang W, Scott MR, Wang L (2020) V4D: 4D convolutional neural networks for video-level representation learning. arXiv preprint arXiv:2002.07442"},{"key":"11270_CR55","doi-asserted-by":"crossref","unstructured":"Chi L, Yuan Z, Mu Y, Wang C (2020) Non-local neural networks with grouped bilinear attentional transforms. In: CVPR, pp 11804\u201311813","DOI":"10.1109\/CVPR42600.2020.01182"},{"key":"11270_CR56","doi-asserted-by":"crossref","unstructured":"Pang B, Peng G, Li Y, Lu C (2021) PGT: A progressive method for training models on long videos. In: CVPR, pp 11379\u201311389","DOI":"10.1109\/CVPR46437.2021.01122"},{"key":"11270_CR57","doi-asserted-by":"crossref","unstructured":"Li X, Liu C, Shuai B, Zhu Y, Chen H, Tighe J (2022) NUTA: Non-uniform temporal aggregation for action recognition. In: WACV, pp 3683\u20133692","DOI":"10.1109\/WACV51458.2022.00090"},{"key":"11270_CR58","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) Slowfast networks for video recognition. In: ICCV, pp 6202\u20136211","DOI":"10.1109\/ICCV.2019.00630"},{"key":"11270_CR59","doi-asserted-by":"crossref","unstructured":"Yang C, Xu Y, Shi J, Dai B, Zhou B (2020) Temporal pyramid network for action recognition. In: CVPR, pp 591\u2013600","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"11270_CR60","doi-asserted-by":"crossref","unstructured":"Jiang Y, Gong X, Wu J, Shi H, Yan Z, Wang Z (2022) Auto-X3D: Ultra-efficient video understanding via finer-grained neural architecture search. In: WACV, pp 2554\u20132563","DOI":"10.1109\/WACV51458.2022.00241"},{"issue":"3","key":"11270_CR61","doi-asserted-by":"publisher","first-page":"1616","DOI":"10.3390\/s23031616","volume":"23","author":"R Sun","year":"2023","unstructured":"Sun R, Zhang T, Wan Y, Zhang F, Wei J (2023) Wlit: Windows and linear transformer for video action recognition. Sensors 23(3):1616","journal-title":"Sensors"},{"key":"11270_CR62","doi-asserted-by":"crossref","unstructured":"Wang H, Tran D, Torresani L, Feiszli M (2020) Video modeling with correlation networks. In: CVPR, pp 352\u2013361","DOI":"10.1109\/CVPR42600.2020.00043"},{"key":"11270_CR63","doi-asserted-by":"crossref","unstructured":"Zhou Y, Sun X, Zha Z-J, Zeng W (2018) MiCT: Mixed 3D\/2D convolutional tube for human action recognition. In: CVPR, pp 449\u2013458","DOI":"10.1109\/CVPR.2018.00054"},{"key":"11270_CR64","doi-asserted-by":"publisher","first-page":"14969","DOI":"10.1109\/ACCESS.2019.2894025","volume":"7","author":"Z Liu","year":"2019","unstructured":"Liu Z, Hu H (2019) Spatiotemporal relation networks for video action recognition. IEEE Access 7:14969\u201314976","journal-title":"IEEE Access"},{"issue":"3","key":"11270_CR65","doi-asserted-by":"publisher","first-page":"0265115","DOI":"10.1371\/journal.pone.0265115","volume":"17","author":"G Yang","year":"2022","unstructured":"Yang G, Yang Y, Lu Z, Yang J, Liu D, Zhou C, Fan Z (2022) STA-TSN: Spatial-temporal attention temporal segment network for action recognition in video. PLoS ONE 17(3):0265115","journal-title":"PLoS ONE"},{"key":"11270_CR66","doi-asserted-by":"crossref","unstructured":"Liu Z, Luo D, Wang Y, Wang L, Tai Y, Wang C, Li J, Huang F, Lu T (2020) TEINet: Towards an efficient architecture for video recognition. In: AAAI, vol. 34, pp 11669\u201311676","DOI":"10.1609\/aaai.v34i07.6836"},{"key":"11270_CR67","doi-asserted-by":"crossref","unstructured":"Zhang Y, Li X, Liu C, Shuai B, Zhu Y, Brattoli B, Chen H, Marsic I, Tighe J (2021) VidTr: Video transformer without convolutions. In: ICCV, pp 13577\u201313587","DOI":"10.1109\/ICCV48922.2021.01332"},{"issue":"3","key":"11270_CR68","doi-asserted-by":"publisher","first-page":"1707","DOI":"10.3390\/s23031707","volume":"23","author":"B Chen","year":"2023","unstructured":"Chen B, Meng F, Tang H, Tong G (2023) Two-level attention module based on spurious-3d residual networks for human action recognition. Sensors 23(3):1707","journal-title":"Sensors"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11270-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-023-11270-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11270-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,28]],"date-time":"2023-10-28T19:13:00Z","timestamp":1698520380000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-023-11270-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,18]]},"references-count":68,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["11270"],"URL":"https:\/\/doi.org\/10.1007\/s11063-023-11270-9","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"value":"1370-4621","type":"print"},{"value":"1573-773X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,4,18]]},"assertion":[{"value":"24 March 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 April 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflicts of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}