{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T06:40:10Z","timestamp":1746254410955,"version":"3.40.4"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T00:00:00Z","timestamp":1742860800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T00:00:00Z","timestamp":1742860800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Real-Time Image Proc"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s11554-025-01662-6","type":"journal-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T18:23:59Z","timestamp":1743099839000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Stme-net: spatio-temporal motion excitation network for action recognition"],"prefix":"10.1007","volume":"22","author":[{"given":"Qian","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Yanxiong","family":"Su","sequence":"additional","affiliation":[]},{"given":"Hui","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,25]]},"reference":[{"unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. Adv. Neural Inf. Process. Syst. 27 (2014)","key":"1662_CR1"},{"doi-asserted-by":"crossref","unstructured":"Goyal, R., Ebrahimi\u00a0Kahou, S., Michalski, V., Materzynska, J., Westphal, S., Kim, H., Haenel, V., Fruend, I., Yianilos, P., Mueller-Freitag, M., et\u00a0al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 842\u20135850 (2017)","key":"1662_CR2","DOI":"10.1109\/ICCV.2017.622"},{"doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. Ieee (2009)","key":"1662_CR3","DOI":"10.1109\/CVPR.2009.5206848"},{"doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., Van\u00a0Gool, L.: Temporal segment networks: towards good practices for deep action recognition. In: European Conference on Computer Vision, pp. 20\u201336. Springer (2016)","key":"1662_CR4","DOI":"10.1007\/978-3-319-46484-8_2"},{"doi-asserted-by":"crossref","unstructured":"Lin, J., Gan, C., Han, S.: Tsm: Temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7083\u20137093 (2019)","key":"1662_CR5","DOI":"10.1109\/ICCV.2019.00718"},{"doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6202\u20136211 (2019)","key":"1662_CR6","DOI":"10.1109\/ICCV.2019.00630"},{"doi-asserted-by":"crossref","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 305\u2013321 (2018)","key":"1662_CR7","DOI":"10.1007\/978-3-030-01267-0_19"},{"doi-asserted-by":"crossref","unstructured":"Luo, C., Yuille, A.L.: Grouped spatial-temporal aggregation for efficient action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5512\u20135521 (2019)","key":"1662_CR8","DOI":"10.1109\/ICCV.2019.00561"},{"doi-asserted-by":"crossref","unstructured":"Sudhakaran, S., Escalera, S., Lanz, O.: Gate-shift networks for video action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1102\u20131111 (2020)","key":"1662_CR9","DOI":"10.1109\/CVPR42600.2020.00118"},{"doi-asserted-by":"crossref","unstructured":"Jiang, B., Wang, M.M., Gan, W., Wu, W., Yan, J.: Stm: Spatiotemporal and motion encoding for action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2000\u20132009 (2019)","key":"1662_CR10","DOI":"10.1109\/ICCV.2019.00209"},{"doi-asserted-by":"crossref","unstructured":"Ullah, M., Yamin, M.M., Mohammed, A.K., Khan, S.D., Ullah, H., Cheikh, FA.: Attention-based lstm network for action recognition in sports. In: IRIACV (2021)","key":"1662_CR11","DOI":"10.2352\/ISSN.2470-1173.2021.6.IRIACV-302"},{"doi-asserted-by":"crossref","unstructured":"Shao, H., Qian, S., Liu, Yu.: Temporal interlacing network. In: Proceedings of the AAAI Conference on Artificial Intelligence 34, pp. 11966\u201311973 (2020)","key":"1662_CR12","DOI":"10.1609\/aaai.v34i07.6872"},{"doi-asserted-by":"crossref","unstructured":"Li, C., Zhong, Q., Xie, D., Pu, S.: Collaborative spatiotemporal feature learning for video action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vsion and Pattern Recognition, pp. 7872\u20137881 (2019)","key":"1662_CR13","DOI":"10.1109\/CVPR.2019.00806"},{"unstructured":"Li, K., Li, X., Wang, Y., Wang, J., Qiao, Y.: Ct-net: Channel tensorization network for video classification. arXiv preprint arXiv:2106.01603 (2021)","key":"1662_CR14"},{"unstructured":"Wenhao, W., He, D., Tianwei Lin, F., Li, C.G., Ding, E.: Multi-view fusion network for efficient video recognition, Mvfnet (2021)","key":"1662_CR15"},{"doi-asserted-by":"crossref","unstructured":"Lee, M., Lee, S., Son, S., Park, G., Kwak, N.: Motion feature network: Fixed motion filter for action recognition. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 387\u2013403 (2018)","key":"1662_CR16","DOI":"10.1007\/978-3-030-01249-6_24"},{"doi-asserted-by":"crossref","unstructured":"Liu, Z., Luo, D., Wang, Y., Wang, L., Tai, Y., Wang, C., Li, J., Huang, F., Tong, L.: Teinet: Towards an efficient architecture for video recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence 34, pp. 11669\u201311676 (2020)","key":"1662_CR17","DOI":"10.1609\/aaai.v34i07.6836"},{"doi-asserted-by":"crossref","unstructured":"Li, Y., Ji, B., Shi, X., Zhang, J., Kang, B., Wang, L.: Tea: Temporal excitation and aggregation for action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 909\u2013918 (2020)","key":"1662_CR18","DOI":"10.1109\/CVPR42600.2020.00099"},{"doi-asserted-by":"crossref","unstructured":"Liu, Z., Wang, L., Wu, W., Qian, C., Lu, T.: Tam: Temporal adaptive module for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13708\u201313718 (2021)","key":"1662_CR19","DOI":"10.1109\/ICCV48922.2021.01345"},{"doi-asserted-by":"crossref","unstructured":"Wang, Z., She, Q., Smolic, A.: Action-net: Multipath excitation for action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern recognition, pp. 13214\u201313223 (2021)","key":"1662_CR20","DOI":"10.1109\/CVPR46437.2021.01301"},{"doi-asserted-by":"crossref","unstructured":"Wang, B., Liu, C., Chang, F., Wang, W., Li, N.: Ae-net: adjoint enhancement network for efficient action recognition in video understanding. IEEE Transactions on Multimedia (2022)","key":"1662_CR21","DOI":"10.1109\/TMM.2022.3193057"},{"doi-asserted-by":"crossref","unstructured":"Sudhakaran, S., Escalera, S., Lanz, O.: Gate-shift-fuse for video action recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)","key":"1662_CR22","DOI":"10.1109\/TPAMI.2023.3268134"},{"doi-asserted-by":"crossref","unstructured":"Gong, G., Zheng, L., Mu, Y.: Scale matters: Temporal scale aggregation network for precise action localization in untrimmed videos. In: 2020 IEEE International Conference on Multimedia and Expo (ICME), pp. 1\u20136. IEEE (2020)","key":"1662_CR23","DOI":"10.1109\/ICME46284.2020.9102850"},{"doi-asserted-by":"crossref","unstructured":"Wang, L., Tong, Z., Ji, B., Wu, G.: Tdn: Temporal difference networks for efficient action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1895\u20131904 (2021)","key":"1662_CR24","DOI":"10.1109\/CVPR46437.2021.00193"},{"doi-asserted-by":"crossref","unstructured":"Hara, K., Kataoka, H., Satoh, Y.: Can spatiotemporal 3d cnns retrace the history of 2d cnns and imagenet? In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6546\u20136555 (2018)","key":"1662_CR25","DOI":"10.1109\/CVPR.2018.00685"},{"doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3d: Expanding architectures for efficient video recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 203\u2013213 (2020)","key":"1662_CR26","DOI":"10.1109\/CVPR42600.2020.00028"},{"doi-asserted-by":"crossref","unstructured":"Chen, Y., Kalantidis, Y., Li, J., Yan, S., Feng, J.: Multi-fiber networks for video recognition. In: Proceedings of the European Conference on Computer Vsion (ECCV), pp. 352\u2013367 (2018)","key":"1662_CR27","DOI":"10.1007\/978-3-030-01246-5_22"},{"doi-asserted-by":"crossref","unstructured":"Yang, C., Xu, Y., Shi, J., Dai, B., Zhou, B.: Temporal pyramid network for action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 591\u2013600 (2020)","key":"1662_CR28","DOI":"10.1109\/CVPR42600.2020.00067"},{"doi-asserted-by":"crossref","unstructured":"Dai, R., Das, S., Kahatapitiya, K., Ryoo, M.S., Br\u00e9mond, F.: Ms-tct: multi-scale temporal convtransformer for action detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20041\u201320051 (2022)","key":"1662_CR29","DOI":"10.1109\/CVPR52688.2022.01941"},{"doi-asserted-by":"crossref","unstructured":"Truong, T.-D., Bui, Q.-H., Duong, C.N., Seo, H.-S., Phung, S.L., Li, X., Luu, K.: Direcformer: A directed attention in transformer approach to robust action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20030\u201320040 (2022)","key":"1662_CR30","DOI":"10.1109\/CVPR52688.2022.01940"},{"doi-asserted-by":"crossref","unstructured":"Ranasinghe, K., Naseer, M., Khan, S., Khan, F.S., Ryoo, M.S.: Self-supervised video transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2874\u20132884 (2022)","key":"1662_CR31","DOI":"10.1109\/CVPR52688.2022.00289"},{"doi-asserted-by":"crossref","unstructured":"Lin, W., Mirza, M.J., Kozinski, M., Possegger, H., Kuehne, H., Bischof, H.: Video test-time adaptation for action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22952\u201322961 (2023)","key":"1662_CR32","DOI":"10.1109\/CVPR52729.2023.02198"},{"unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)","key":"1662_CR33"},{"doi-asserted-by":"crossref","unstructured":"Qing, Z., Zhang, S., Huang, Z., Wang, X., Wang, Y., Lv, Y., Gao, C., Sang, N.: Mar: Masked autoencoders for efficient action recognition. IEEE Transactions on Multimedia (2023)","key":"1662_CR34","DOI":"10.1109\/TMM.2023.3263288"},{"doi-asserted-by":"crossref","unstructured":"Hao, Y., Zhang, H., Ngo, C.-W., He, X.: Group contextualization for video recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 928\u2013938 (2022)","key":"1662_CR35","DOI":"10.1109\/CVPR52688.2022.00100"},{"doi-asserted-by":"crossref","unstructured":"Zhou, B., Andonian, A., Oliva, A., Torralba, A.: Temporal relational reasoning in videos. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 803\u2013818 (2018)","key":"1662_CR36","DOI":"10.1007\/978-3-030-01246-5_49"},{"unstructured":"Zhao, Y., Xiong, Y., Lin, D.: Trajectory convolution for action recognition. Adv. Neural Inf. Process. Syst., 31 (2018)","key":"1662_CR37"},{"doi-asserted-by":"crossref","unstructured":"Fan, L., Buch, S., Wang, G., Cao, R., Zhu, Y., Niebles, J.C., Fei-Fei, L.: Rubiksnet: Learnable 3d-shift for efficient video action recognition. In: European Conference on Computer Vision, pp. 505\u2013521. Springer (2020)","key":"1662_CR38","DOI":"10.1007\/978-3-030-58529-7_30"},{"doi-asserted-by":"crossref","unstructured":"Zhu, X., Xu, C., Hui, L., Lu, C., Tao, D.: Approximated bilinear modules for temporal modeling. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3494\u20133503 (2019)","key":"1662_CR39","DOI":"10.1109\/ICCV.2019.00359"},{"unstructured":"Fan, Q., Chen, C.-F.R., Kuehne, H., Pistoia, M., Cox, D.: More is less: Learning efficient video representations by big-little network and depthwise temporal aggregation. Adv. Neural Inf. Process. Syst., 32 (2019)","key":"1662_CR40"},{"doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Videos as space-time region graphs. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 399\u2013417 (2018)","key":"1662_CR41","DOI":"10.1007\/978-3-030-01228-1_25"},{"doi-asserted-by":"crossref","unstructured":"Wang, H., Tran, D., Torresani, L., Feiszli, M.: Video modeling with correlation networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 352\u2013361 (2020)","key":"1662_CR42","DOI":"10.1109\/CVPR42600.2020.00043"},{"doi-asserted-by":"crossref","unstructured":"Li, X., Wang, Y., Zhou, Z., Qiao, Y.: Smallbignet: Integrating core and contextual views for video classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1092\u20131101 (2020)","key":"1662_CR43","DOI":"10.1109\/CVPR42600.2020.00117"},{"unstructured":"Zhang, S., Guo, S., Huang, W., Scott, M.R., Wang, L.: V4d: 4d convolutional neural networks for video-level representation learning. arXiv preprint arXiv:2002.07442 (2020)","key":"1662_CR44"},{"doi-asserted-by":"crossref","unstructured":"Liu, X., Lee, J.-Y., Jin, H.: Learning video representations from correspondence proposals. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4273\u20134281 (2019)","key":"1662_CR45","DOI":"10.1109\/CVPR.2019.00440"},{"doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","key":"1662_CR46","DOI":"10.1109\/CVPR.2017.502"},{"issue":"18","key":"1662_CR47","doi-asserted-by":"publisher","first-page":"14593","DOI":"10.1007\/s00521-020-05144-7","volume":"32","author":"Z Liu","year":"2020","unstructured":"Liu, Z., Li, Z., Wang, R., Zong, M., Ji, W.: Spatiotemporal saliency-based multi-stream networks with attention-aware lstm for action recognition. Neural Comput. Appl. 32(18), 14593\u201314602 (2020)","journal-title":"Neural Comput. Appl."},{"doi-asserted-by":"crossref","unstructured":"Sun, S., Kuang, Z., Sheng, L., Ouyang, W., Zhang, W.: Optical flow guided feature: A fast and robust motion representation for video action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1390\u20131399 (2018)","key":"1662_CR48","DOI":"10.1109\/CVPR.2018.00151"},{"doi-asserted-by":"crossref","unstructured":"Zolfaghari, M., Singh, K., Brox, T.: Eco: Efficient convolutional network for online video understanding. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 695\u2013712 (2018)","key":"1662_CR49","DOI":"10.1007\/978-3-030-01216-8_43"}],"container-title":["Journal of Real-Time Image Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11554-025-01662-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11554-025-01662-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11554-025-01662-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T06:23:00Z","timestamp":1746253380000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11554-025-01662-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,25]]},"references-count":49,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["1662"],"URL":"https:\/\/doi.org\/10.1007\/s11554-025-01662-6","relation":{},"ISSN":["1861-8200","1861-8219"],"issn-type":[{"type":"print","value":"1861-8200"},{"type":"electronic","value":"1861-8219"}],"subject":[],"published":{"date-parts":[[2025,3,25]]},"assertion":[{"value":"20 November 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}],"article-number":"88"}}