{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T04:41:40Z","timestamp":1774672900682,"version":"3.50.1"},"reference-count":56,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61771420"],"award-info":[{"award-number":["61771420"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62001413"],"award-info":[{"award-number":["62001413"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003787","name":"Natural Science Foundation of Hebei Province","doi-asserted-by":"publisher","award":["F2020203064"],"award-info":[{"award-number":["F2020203064"]}],"id":[{"id":"10.13039\/501100003787","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003787","name":"Natural Science Foundation of Hebei Province","doi-asserted-by":"publisher","award":["F2024203069"],"award-info":[{"award-number":["F2024203069"]}],"id":[{"id":"10.13039\/501100003787","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Science and Technology Project of Hebei Education Department","award":["BJK2023117"],"award-info":[{"award-number":["BJK2023117"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s10489-025-06742-5","type":"journal-article","created":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T01:36:06Z","timestamp":1751333766000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Dual cross transformer based on multi-scale fusion for fine-grained action recognition"],"prefix":"10.1007","volume":"55","author":[{"given":"Jirui","family":"Di","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0300-6144","authenticated-orcid":false,"given":"Zhengping","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Hehao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Qiming","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Zhe","family":"Sun","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,1]]},"reference":[{"key":"6742_CR1","unstructured":"Soomro K, Zamir AR, Shah M (2012) Ucf101: a dataset of 101 human action classes from videos in the wild. arXiv:1212.0402"},{"key":"6742_CR2","doi-asserted-by":"crossref","unstructured":"Kuehne H, Jhuang H, Garrote E, Poggio T, Serre T (2011) HMDB: a large video database for human motion recognition. In: Proc. 
IEEE international conference on computer vision (ICCV), pp 2556\u20132563","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"6742_CR3","unstructured":"Kay W, Carreira J, Simonyan K, Zhang B, Hillier C, Vijayanarasimhan S, Viola F, Green T, Back T, Natsev P et al (2017) The kinetics human action video dataset. arXiv:1705.06950"},{"key":"6742_CR4","unstructured":"Jiang YG, Liu J, Zamir AR, Toderici G, Laptev I, Shah M, Sukthankar R (2014) Thumos challenge: action recognition with a large number of classes. In: ECCV workshop"},{"key":"6742_CR5","doi-asserted-by":"crossref","unstructured":"Heilbron FC, Escorcia V, Ghanem B, Niebles JC (2015) Activitynet: a large-scale video benchmark for human activity understanding. In: Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 961\u2013970","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"6742_CR6","doi-asserted-by":"crossref","unstructured":"Gu C et al (2018) AVA: a video dataset of spatio-temporally localized atomic visual actions. In: Proc. IEEE\/CVF conference on Computer Vision and Pattern Recognition (CVPR), pp 6047\u20136056","DOI":"10.1109\/CVPR.2018.00633"},{"key":"6742_CR7","doi-asserted-by":"crossref","unstructured":"Shao D, Zhao Y, Dai B, Lin D (2020) Finegym: a hierarchical video dataset for fine-grained action understanding. In: Proc. IEEE conference on Computer Vision and Pattern Recognition (CVPR), pp 2616\u20132625","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"6742_CR8","doi-asserted-by":"crossref","unstructured":"Li Y, Li Y, Vasconcelos N (2018) Resound: towards action recognition without representation bias. In: Proc. European Conference on Computer Vision (ECCV), pp 520\u2013535","DOI":"10.1007\/978-3-030-01231-1_32"},{"issue":"11","key":"6742_CR9","doi-asserted-by":"publisher","first-page":"2740","DOI":"10.1109\/TPAMI.2018.2868668","volume":"41","author":"L Wang","year":"2019","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van Gool L (2019) Temporal segment networks for action recognition in videos. IEEE Trans Pattern Anal Mach Intell 41(11):2740\u20132755","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"6742_CR10","doi-asserted-by":"crossref","unstructured":"Zhou B, Andonian A, Oliva A, Torralba A (2018) Temporal relational reasoning in videos. In: Proc. European Conference on Computer Vision (ECCV), pp 803\u2013818","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"6742_CR11","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? A new model and the kinetics dataset. In: Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"6742_CR12","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) Slowfast networks for video recognition. In: Proc. IEEE international conference on computer vision (ICCV), pp 6202\u20136211","DOI":"10.1109\/ICCV.2019.00630"},{"key":"6742_CR13","unstructured":"Bertasius G, Wang H, Torresani L (2021) Is space-time attention all you need for video understanding? arXiv:2102.05095"},{"issue":"10","key":"6742_CR14","doi-asserted-by":"publisher","first-page":"12581","DOI":"10.1109\/TPAMI.2023.3282631","volume":"45","author":"K Li","year":"2023","unstructured":"Li K, Wang Y et al (2023) UniFormer: unifying convolution and self-attention for visual recognition. 
IEEE Trans Pattern Anal Mach Intell 45(10):12581\u201312600","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"6742_CR15","doi-asserted-by":"crossref","unstructured":"Arnab A, Dehghani M, Heigold G (2021) Vivit: a video vision transformer. In: Proc. IEEE\/CVF International Conference on Computer Vision (ICCV), pp 6836\u20136846","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"6742_CR16","doi-asserted-by":"crossref","unstructured":"Neimark D, Bar O, Zohar M, Asselmann D (2021) Video transformer network. arXiv:2102.00719","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"6742_CR17","unstructured":"Patrick M, Campbell D et al (2021) Keeping your eye on the ball: trajectory attention in video transformers. In: Advances in neural information processing systems, vol 34, pp 12493\u201312506"},{"key":"6742_CR18","doi-asserted-by":"crossref","unstructured":"Fan H, Xiong B, Mangalam K, Li Y, Yan Z, Malik JA, Feichtenhofer C (2021) Multiscale vision transformers. In: Proc. IEEE international conference on computer vision, pp 6824\u20136835","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"6742_CR19","doi-asserted-by":"crossref","unstructured":"Yang C, Xu Y, Shi J, Dai B, Zhou B (2020) Temporal pyramid network for action recognition. In: Proc. IEEE conference on Computer Vision and Pattern Recognition (CVPR), pp 591\u2013600","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"6742_CR20","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: Advances in neural information processing systems, pp 568\u2013576"},{"key":"6742_CR21","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Pinz A, Zisserman A (2016) Convolutional two-stream network fusion for video action recognition. In: Proc. IEEE conference on computer vision and pattern recognition (CVPR), pp 1933\u20131941","DOI":"10.1109\/CVPR.2016.213"},{"key":"6742_CR22","doi-asserted-by":"crossref","unstructured":"Lin J, Gan C, Han S (2019) Tsm: temporal shift module for efficient video understanding. In: Proc. IEEE International Conference on Computer Vision (ICCV), pp 7083\u20137093","DOI":"10.1109\/ICCV.2019.00718"},{"key":"6742_CR23","doi-asserted-by":"crossref","unstructured":"Liu Z, Luo D, Wang Y, Wang L, Tai Y, Wang C, Li J, Huang F, Lu T (2020) Teinet: towards an efficient architecture for video recognition. In: Proc. AAAI conference on artificial intelligence, vol. 34, no. 07, pp 11669\u201311676","DOI":"10.1609\/aaai.v34i07.6836"},{"key":"6742_CR24","doi-asserted-by":"crossref","unstructured":"Luo C, Yuille AL (2019) Grouped spatial-temporal aggregation for efficient action recognition. In: Proc. IEEE International Conference on Computer Vision (ICCV), pp 5512\u20135521","DOI":"10.1109\/ICCV.2019.00561"},{"key":"6742_CR25","doi-asserted-by":"crossref","unstructured":"Li Y, Ji B, Shi X, Zhang J, Kang B, Wang L (2020) Tea: temporal excitation and aggregation for action recognition. In: Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 909\u2013918","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"6742_CR26","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proc. IEEE international conference on computer vision (ICCV), pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"6742_CR27","unstructured":"Xie S, Sun C, Huang J, Tu Z, Murphy K (2017) Rethinking spatiotemporal feature learning for video understanding. 
arXiv:1712.04851"},{"key":"6742_CR28","doi-asserted-by":"crossref","unstructured":"Tran D, Wang H, Torresani L, Ray J, LeCun Y, Paluri M (2018) A closer look at spatiotemporal convolutions for action recognition. In: Proc. IEEE conference on Computer Vision and Pattern Recognition (CVPR), pp 6450\u20136459","DOI":"10.1109\/CVPR.2018.00675"},{"key":"6742_CR29","unstructured":"Devlin J, Chang M, Lee K, Toutanova K (2019) Bert: pre-training of deep bidirectional transformers for language understanding. In: Proc. NAACL-HLT, vol 1, p 2"},{"key":"6742_CR30","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN et al (2017) Attention is all you need. In: Proc. Advances in neural information processing systems, vol 30"},{"key":"6742_CR31","doi-asserted-by":"crossref","unstructured":"Bello I, Zoph B, Vaswani A, Shlens J, Le QV (2019) Attention augmented convolutional networks. In: Proc. IEEE International Conference on Computer Vision (ICCV), pp 3286\u20133295","DOI":"10.1109\/ICCV.2019.00338"},{"key":"6742_CR32","unstructured":"Dai Z, Liu H, Le Q, Tan M (2021) Coatnet: marrying convolution and attention for all data sizes. Advances in neural information processing systems, vol 34, pp 3965\u20133977"},{"key":"6742_CR33","doi-asserted-by":"crossref","unstructured":"Wang X, Girshick R, Gupta A, He K (2018) Non-local neural networks. In: Proc. IEEE conference on Computer Vision and Pattern Recognition (CVPR), pp 7794\u20137803","DOI":"10.1109\/CVPR.2018.00813"},{"key":"6742_CR34","doi-asserted-by":"crossref","unstructured":"Yao T, Li Y, Pan Y et al (2023) Dual vision transformer. IEEE Trans Pattern Anal Mach Intell","DOI":"10.1109\/TPAMI.2023.3268446"},{"key":"6742_CR35","doi-asserted-by":"crossref","unstructured":"Lin T, Doll\u00e1r P, Girshick R, He K, Hariharan B, Belongie S (2017) Feature pyramid networks for object detection. In: Proc. IEEE conference on Computer Vision and Pattern Recognition (CVPR), pp 2117\u20132125","DOI":"10.1109\/CVPR.2017.106"},{"key":"6742_CR36","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. arXiv:1607.06450"},{"key":"6742_CR37","doi-asserted-by":"crossref","unstructured":"Truong T-D et al (2022) DirecFormer: a directed attention in transformer approach to robust action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 19998\u201320008","DOI":"10.1109\/CVPR52688.2022.01940"},{"key":"6742_CR38","unstructured":"Zhao Y, Xiong Y, Lin D (2023) Mmaction. https:\/\/github.com\/open-mmlab\/mmaction"},{"key":"6742_CR39","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108970","volume":"133","author":"Y Zhou","year":"2023","unstructured":"Zhou Y, Huang Z, Yang X, Ang M, Ng TK (2023) GCM: efficient video recognition with glance and combine module. Pattern Recognit 133:108970","journal-title":"Pattern Recognit"},{"key":"6742_CR40","unstructured":"Kim M, Kwon H, Wang C et al (2021) Relational self-attention: what\u2019s missing in attention for video understanding. Advances in neural information processing systems, vol 34, pp 8046\u20138059"},{"key":"6742_CR41","doi-asserted-by":"crossref","unstructured":"Xiang W, Li C, Wang B, Wei X, Hua X, Zhang L (2022) Spatiotemporal self-attention modeling with temporal patch shift for action recognition. In: Proc. 
European Conference on Computer Vision (ECCV), pp 627\u2013644","DOI":"10.1007\/978-3-031-20062-5_36"},{"key":"6742_CR42","doi-asserted-by":"crossref","unstructured":"Wang J, Torresani L (2022) Deformable video transformer. In: Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 14056\u201314062","DOI":"10.1109\/CVPR52688.2022.01366"},{"key":"6742_CR43","doi-asserted-by":"crossref","unstructured":"Sun B, Ye X, Wang Z, Li H, Wang Z (2023) Exploring coarse-to-fine action token localization and interaction for fine-grained video action recognition. In: Proceedings of the 31st ACM international conference on multimedia, pp 5070\u20135078","DOI":"10.1145\/3581783.3612206"},{"key":"6742_CR44","doi-asserted-by":"crossref","unstructured":"Dai J, Qi H, Xiong Y, Li Y, Zhang G, Hu H, Wei Y (2017) Deformable convolutional networks. In: Proc. IEEE international conference on computer vision (ICCV), pp 764\u2013773","DOI":"10.1109\/ICCV.2017.89"},{"key":"6742_CR45","unstructured":"Yu F, Koltun V (2015) Multi-scale context aggregation by dilated convolutions. arXiv:1511.07122"},{"key":"6742_CR46","doi-asserted-by":"crossref","unstructured":"Yu F, Koltun V, Funkhouser T (2017) Dilated residual networks. In: Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 472\u2013480","DOI":"10.1109\/CVPR.2017.75"},{"key":"6742_CR47","doi-asserted-by":"crossref","unstructured":"Hu J, Shen L, Sun G (2018) Squeeze-and-excitation networks. In: Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 7132\u20137141","DOI":"10.1109\/CVPR.2018.00745"},{"key":"6742_CR48","unstructured":"Hu J, Shen L, Albanie S, Sun G, Vedaldi A (2018) Gather-excite: exploiting feature context in convolutional neural networks. arXiv:1810.12348"},{"key":"6742_CR49","unstructured":"Raghu M, Unterthiner T, Kornblith S et al (2021) Do vision transformers see like convolutional neural networks? Advances in neural information processing systems, vol 34, pp 12116\u201312128"},{"key":"6742_CR50","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S (2020) End-to-end object detection with transformers. In: Proc. European Conference on Computer Vision (ECCV), pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"6742_CR51","unstructured":"Zhu X, Su W, Lu L, Li B, Wang X, Dai J (2020) Deformable detr: deformable transformers for end-to-end object detection. arXiv:2010.04159"},{"key":"6742_CR52","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2022.104589","volume":"129","author":"J Shang","year":"2023","unstructured":"Shang J, Wei P, Li H et al (2023) Multi-scale interaction transformer for temporal action proposal generation. Image Vision Comput 129:104589","journal-title":"Image Vision Comput"},{"key":"6742_CR53","doi-asserted-by":"crossref","unstructured":"Wang R, Chen D, Wu Z et al (2022) Bevt: Bert pretraining of video transformers. In: Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 14733\u201314743","DOI":"10.1109\/CVPR52688.2022.01432"},{"key":"6742_CR54","doi-asserted-by":"crossref","unstructured":"Herzig R, Ben-Avraham E, Mangalam K et al (2022) Object-region video transformers. In: Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 3148\u20133159","DOI":"10.1109\/CVPR52688.2022.00315"},{"key":"6742_CR55","doi-asserted-by":"crossref","unstructured":"Steck H, Ekanadham C, Kallus N (2024) Is cosine-similarity of embeddings really about similarity? 
Companion proceedings of the ACM web conference, pp 887\u2013890","DOI":"10.1145\/3589335.3651526"},{"key":"6742_CR56","doi-asserted-by":"crossref","unstructured":"Abaddi S (2025) Q-Omni: a quantum computing and GPT-4o solution for camel-vehicle collisions. Int J Transp Sci Technol","DOI":"10.1016\/j.ijtst.2025.02.002"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-025-06742-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-025-06742-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-025-06742-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T15:56:09Z","timestamp":1758297369000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-025-06742-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,1]]},"references-count":56,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["6742"],"URL":"https:\/\/doi.org\/10.1007\/s10489-025-06742-5","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,1]]},"assertion":[{"value":"17 June 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 July 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}}],"article-number":"832"}}