{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T20:28:29Z","timestamp":1771705709405,"version":"3.50.1"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1007\/s00371-023-03165-6","type":"journal-article","created":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T13:02:08Z","timestamp":1701954128000},"page":"6279-6293","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["STAM: a spatio-temporal adaptive module for improving static convolutions in action recognition"],"prefix":"10.1007","volume":"40","author":[{"given":"Wei","family":"Li","sequence":"first","affiliation":[]},{"given":"Weijun","family":"Gong","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6564-4745","authenticated-orcid":false,"given":"Yurong","family":"Qian","sequence":"additional","affiliation":[]},{"given":"Haichen","family":"Tian","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,7]]},"reference":[{"key":"3165_CR1","doi-asserted-by":"crossref","unstructured":"Goyal, R., Ebrahimi Kahou, S., Michalski, V., et al.: The\u201c something something\u201d video database for learning and evaluating visual common sense. In Proceedings of the IEEE International Conference on Computer Vision 5842\u20135850 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"3165_CR2","unstructured":"Kay, W., Carreira, J., Simonyan, K., et al.: The kinetics human action video dataset. (2017) arXiv preprint arXiv:1705.06950"},{"key":"3165_CR3","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., et al.: Learning spatiotemporal features with 3d convolutional networks. In Proceedings of the IEEE International Conference on Computer Vision, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"issue":"18","key":"3165_CR4","doi-asserted-by":"publisher","first-page":"3290","DOI":"10.3390\/math10183290","volume":"10","author":"Z Yang","year":"2022","unstructured":"Yang, Z., An, G., Zhang, R.: STSM: spatio-temporal shift module for efficient action recognition. Mathematics 10(18), 3290 (2022)","journal-title":"Mathematics"},{"key":"3165_CR5","doi-asserted-by":"crossref","unstructured":"Lin, J., Gan, C., Han, S.: Tsm: temporal shift module for efficient video understanding. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7083\u20137093 (2019)","DOI":"10.1109\/ICCV.2019.00718"},{"key":"3165_CR6","doi-asserted-by":"crossref","unstructured":"Liu, Z., Wang, L., Wu, W., et al.: Tam: Temporal adaptive module for video recognition. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13708\u201313718 (2021)","DOI":"10.1109\/ICCV48922.2021.01345"},{"key":"3165_CR7","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Mei, T.: Learning spatio-temporal representation with pseudo-3d residual networks. In Proceedings of the IEEE International Conference on Computer Vision, pp. 5533\u20135541 (2017)","DOI":"10.1109\/ICCV.2017.590"},{"key":"3165_CR8","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., et al.: A closer look at spatiotemporal convolutions for action recognition. In Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"3165_CR9","doi-asserted-by":"crossref","unstructured":"Yue-Hei Ng, J., Hausknecht, M., Vijayanarasimhan, S., et al.: Beyond short snippets: deep networks for video classification. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4694\u20134702 (2015)","DOI":"10.1109\/CVPR.2015.7299101"},{"key":"3165_CR10","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., et al.: Non-local neural networks. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"3165_CR11","doi-asserted-by":"crossref","unstructured":"Zhou, J., Jampani, V., Pi, Z., et al.: Decoupled dynamic filter networks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6647\u20136656 (2021)","DOI":"10.1109\/CVPR46437.2021.00658"},{"key":"3165_CR12","unstructured":"Elsayed, G., Ramachandran, P., Shlens, J., et al.: Revisiting spatial invariance with low-rank local connectivity. In International Conference on Machine Learning. PMLR, pp. 2868\u20132879 (2020)"},{"key":"3165_CR13","unstructured":"Huang, Z., Zhang, S., Pan, L., et al.: TAda! temporally-adaptive convolutions for video understanding. (2021) arXiv preprint arXiv:2110.06178"},{"key":"3165_CR14","doi-asserted-by":"crossref","unstructured":"Li, D., Hu, J., Wang, C., et al.: Involution: Inverting the inherence of convolution for visual recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12321\u201312330 (2021)","DOI":"10.1109\/CVPR46437.2021.01214"},{"key":"3165_CR15","doi-asserted-by":"crossref","unstructured":"Dai, J., Qi, H., Xiong, Y., et al.: Deformable convolutional networks. In Proceedings of the IEEE International Conference on Computer Vision, pp. 764\u2013773 (2017)","DOI":"10.1109\/ICCV.2017.89"},{"key":"3165_CR16","doi-asserted-by":"crossref","unstructured":"Su, H., Jampani, V., Sun, D., et al.: Pixel-adaptive convolutional neural networks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11166\u201311175 (2019)","DOI":"10.1109\/CVPR.2019.01142"},{"key":"3165_CR17","doi-asserted-by":"crossref","unstructured":"Lin, X., Ma, L., Liu, W., et al.: Context-gated convolution. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVIII 16, pp. 701\u2013718. Springer International Publishing (2020)","DOI":"10.1007\/978-3-030-58523-5_41"},{"key":"3165_CR18","unstructured":"Li, C., Zhou, A., Yao, A.: Omni-dimensional dynamic convolution (2022) arXiv preprint arXiv:2209.07947"},{"key":"3165_CR19","doi-asserted-by":"crossref","unstructured":"Chen, Y., Dai, X., Liu, M., et al.: Dynamic convolution: Attention over convolution kernels. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11030\u201311039 (2020)","DOI":"10.1109\/CVPR42600.2020.01104"},{"key":"3165_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et al.: Deep residual learning for image recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3165_CR21","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., et al.: Slowfast networks for video recognition. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"3165_CR22","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3d: Expanding architectures for efficient video recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 203\u2013213 (2020)","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"3165_CR23","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. Adv. Neural Inf. Process. Syst. 27 (2014)"},{"key":"3165_CR24","first-page":"20","volume-title":"European Conference on Computer Vision","author":"L Wang","year":"2016","unstructured":"Wang, L., Xiong, Y., Wang, Z., et al.: Temporal segment networks: towards good practices for deep action recognition. In: European Conference on Computer Vision, pp. 20\u201336. Springer, Cham (2016)"},{"key":"3165_CR25","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"3165_CR26","unstructured":"Park, J., Woo, S., Lee, J.Y., et al.: Bam: Bottleneck attention module (2018) arXiv preprint arXiv:1807.06514"},{"key":"3165_CR27","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, W., Hu, X., et al.: Selective kernel networks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 510\u2013519 (2019)","DOI":"10.1109\/CVPR.2019.00060"},{"key":"3165_CR28","unstructured":"Jaderberg, M., Simonyan, K., Zisserman, A., Kavukcuoglu, K.: Spatial transformer networks. In Proceedings of Neural Information Processing Systems (2015)"},{"key":"3165_CR29","unstructured":"Yang, B., Bender, G., Le, Q.V., et al.: Condconv: conditionally parameterized convolutions for efficient inference. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"3165_CR30","first-page":"776","volume-title":"European Conference on Computer Vision","author":"N Ma","year":"2020","unstructured":"Ma, N., Zhang, X., Huang, J., et al.: Weightnet: revisiting the design space of weight networks. In: European Conference on Computer Vision, pp. 776\u2013792. Springer International Publishing, Cham (2020)"},{"key":"3165_CR31","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In International Conference on Machine Learning. PMLR, pp. 448\u2013456 (2015)"},{"key":"3165_CR32","unstructured":"Nair, V., Hinton, G.E.: Rectified linear units improve restricted boltzmann machines. In Proceedings of the 27th International Conference on Machine Learning (ICML-10), pp. 807\u2013814 (2010)"},{"issue":"4","key":"3165_CR33","doi-asserted-by":"publisher","first-page":"733","DOI":"10.1007\/s41095-023-0364-2","volume":"9","author":"MH Guo","year":"2023","unstructured":"Guo, M.H., Lu, C.Z., Liu, Z.N., et al.: Visual attention network. Comput. Vis. Media 9(4), 733\u2013752 (2023)","journal-title":"Comput. Vis. Media"},{"key":"3165_CR34","doi-asserted-by":"crossref","unstructured":"Tang, C., Zhao, Y., Wang, G., et al.: MLP for image recognition: is self-attention really necessary?. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 36, issue 2, pp. 2344\u20132351 (2022)","DOI":"10.1609\/aaai.v36i2.20133"},{"key":"3165_CR35","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"3165_CR36","unstructured":"Hendrycks, D., Gimpel, K.: Gaussian error linear units (gelus) (2016) arXiv preprint arXiv:1606.08415"},{"key":"3165_CR37","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization (2016) arXiv preprint arXiv:1607.06450"},{"key":"3165_CR38","doi-asserted-by":"crossref","unstructured":"Hao, Y., Zhang, H., Ngo, C.W., et al.: Group contextualization for video recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 928938 (2022)","DOI":"10.1109\/CVPR52688.2022.00100"},{"key":"3165_CR39","doi-asserted-by":"crossref","unstructured":"Chen, J., Kao, S., He, H., et al.: Run, Don\u2019t Walk: Chasing higher FLOPS for faster neural networks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12021\u201312031 (2023)","DOI":"10.1109\/CVPR52729.2023.01157"},{"key":"3165_CR40","first-page":"195","volume-title":"International Workshop on Artificial Neural Networks","author":"J Han","year":"1995","unstructured":"Han, J., Moraga, C.: The influence of the sigmoid function parameters on the speed of backpropagation learning. In: International Workshop on Artificial Neural Networks, pp. 195\u2013201. Springer, Berlin Heidelberg (1995)"},{"key":"3165_CR41","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., et al.: HMDB: a large video database for human motion recognition. In 2011 International Conference on Computer Vision. IEEE, pp. 2556\u20132563 (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"3165_CR42","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., et al.: Imagenet: a large-scale hierarchical image database. In 2009 IEEE Conference on Computer Vision and Pattern Recognition. IEEE, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"3165_CR43","unstructured":"Hinton, G.E., Srivastava, N., Krizhevsky, A., et al.: Improving neural networks by preventing co-adaptation of feature detectors (2012) arXiv preprint arXiv:1207.0580"},{"key":"3165_CR44","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, Y., Zhou, Z., et al.: Smallbignet: integrating core and contextual views for video classification. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1092\u20131101 (2020)","DOI":"10.1109\/CVPR42600.2020.00117"},{"key":"3165_CR45","unstructured":"Li, K., Li, X., Wang, Y., et al.: CT-net: channel tensorization network for video classification (2021) arXiv preprint arXiv:2106.01603"},{"key":"3165_CR46","doi-asserted-by":"crossref","unstructured":"Xie, Z., Chen, J., Wu, K., et al.: Global temporal difference network for action recognition. IEEE Trans. Multimed. (2022)","DOI":"10.1109\/TMM.2022.3224327"},{"key":"3165_CR47","doi-asserted-by":"crossref","unstructured":"Sudhakaran, S., Escalera, S., Lanz, O.: Gate-shift-fuse for video action recognition. In IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)","DOI":"10.1109\/TPAMI.2023.3268134"},{"key":"3165_CR48","doi-asserted-by":"publisher","first-page":"5484","DOI":"10.1109\/TIP.2022.3196175","volume":"31","author":"T Geng","year":"2022","unstructured":"Geng, T., Zheng, F., Hou, X., et al.: Spatial-temporal pyramid graph reasoning for action recognition. IEEE Trans. Image Process. 31, 5484\u20135497 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"3165_CR49","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., et al.: Grad-cam: Visual explanations from deep networks via gradient-based localization. In Proceedings of the IEEE International Conference on Computer Vision, pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"3165_CR50","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et al.: Delving deep into rectifiers: surpassing human-level performance on imagenet classification. In Proceedings of the IEEE International Conference on Computer Vision, pp. 1026\u20131034 (2015)","DOI":"10.1109\/ICCV.2015.123"},{"issue":"5","key":"3165_CR51","doi-asserted-by":"publisher","first-page":"2368","DOI":"10.1109\/TIP.2017.2787612","volume":"27","author":"W Wang","year":"2017","unstructured":"Wang, W., Shen, J.: Deep visual attention prediction. IEEE Trans. Image Process. 27(5), 2368\u20132378 (2017)","journal-title":"IEEE Trans. Image Process."},{"key":"3165_CR52","doi-asserted-by":"crossref","unstructured":"Diba, A., Fayyaz, M., Sharma, V., et al.: Spatio-temporal channel correlation networks for action classification. In Proceedings of the European Conference on Computer Vision (ECCV), pp. 284\u2013299 (2018)","DOI":"10.1007\/978-3-030-01225-0_18"},{"key":"3165_CR53","doi-asserted-by":"crossref","unstructured":"Li, Y., Ji, B., Shi, X., et al.: Tea: Temporal excitation and aggregation for action recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 909\u2013918 (2020)","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"3165_CR54","doi-asserted-by":"publisher","first-page":"7279","DOI":"10.1109\/TIP.2022.3221292","volume":"31","author":"Y Hao","year":"2022","unstructured":"Hao, Y., Wang, S., Tan, Y., et al.: Spatio-temporal collaborative module for efficient action recognition. IEEE Trans. Image Process. 31, 7279\u20137291 (2022)","journal-title":"IEEE Trans. Image Process."},{"issue":"9","key":"3165_CR55","doi-asserted-by":"publisher","first-page":"6529","DOI":"10.1007\/s00521-022-08040-4","volume":"35","author":"W Gong","year":"2023","unstructured":"Gong, W., Qian, Y., Fan, Y.: MPCSAN: multi-head parallel channel-spatial attention network for facial expression recognition in the wild. Neural Comput. Appl. 35(9), 6529\u20136543 (2023)","journal-title":"Neural Comput. Appl."}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-023-03165-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-023-03165-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-023-03165-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,13]],"date-time":"2024-08-13T15:17:30Z","timestamp":1723562250000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-023-03165-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,7]]},"references-count":55,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2024,9]]}},"alternative-id":["3165"],"URL":"https:\/\/doi.org\/10.1007\/s00371-023-03165-6","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12,7]]},"assertion":[{"value":"29 October 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 December 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The author states that there are no competing interests relating to the publication of this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}