{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,2]],"date-time":"2025-10-02T23:41:33Z","timestamp":1759448493295,"version":"build-2065373602"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"34","license":[{"start":{"date-parts":[[2025,4,15]],"date-time":"2025-04-15T00:00:00Z","timestamp":1744675200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,15]],"date-time":"2025-04-15T00:00:00Z","timestamp":1744675200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-025-20827-w","type":"journal-article","created":{"date-parts":[[2025,4,15]],"date-time":"2025-04-15T07:01:42Z","timestamp":1744700502000},"page":"42425-42444","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Dual-branch vision transformer for low-resolution action recognition"],"prefix":"10.1007","volume":"84","author":[{"given":"Ruixin","family":"Chen","sequence":"first","affiliation":[]},{"given":"Chenqiang","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Zhuolin","family":"Tan","sequence":"additional","affiliation":[]},{"given":"Fangxin","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Jiayi","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Xinlin","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,15]]},"reference":[{"key":"20827_CR1","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6202\u20136211","DOI":"10.1109\/ICCV.2019.00630"},{"key":"20827_CR2","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C (2020) X3d: Expanding architectures for efficient video recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 203\u2013213","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"20827_CR3","doi-asserted-by":"crossref","unstructured":"Pan J, Chen S, Shou MZ, Liu Y, Shao J, Li H (2021) Actor-context-actor relation network for spatio-temporal action localization. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 464\u2013474","DOI":"10.1109\/CVPR46437.2021.00053"},{"key":"20827_CR4","unstructured":"Bertasius G, Wang H, Torresani L (2021) Is space-time attention all you need for video understanding? In: ICML, vol 2, p 4"},{"key":"20827_CR5","doi-asserted-by":"crossref","unstructured":"Fan H, Xiong B, Mangalam K, Li Y, Yan Z, Malik J, Feichtenhofer C (2021) Multiscale vision transformers. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6824\u20136835","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"20827_CR6","doi-asserted-by":"crossref","unstructured":"He K, Chen X, Xie S, Li Y, Doll\u00e1r P, Girshick R (2022) Masked autoencoders are scalable vision learners. 
In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16000\u201316009","DOI":"10.1109\/CVPR52688.2022.01553"},{"issue":"11","key":"20827_CR7","first-page":"1","volume":"2","author":"K Soomro","year":"2012","unstructured":"Soomro K, Zamir AR, Shah M (2012) A dataset of 101 human action classes from videos in the wild. Center Res Comput Vis 2(11):1\u20137","journal-title":"Center Res Comput Vis"},{"key":"20827_CR8","doi-asserted-by":"crossref","unstructured":"Kuehne H, Jhuang H, Garrote E, Poggio T, Serre T (2011) Hmdb: a large video database for human motion recognition. In: 2011 international conference on computer vision, IEEE, pp 2556\u20132563","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"20827_CR9","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"20827_CR10","doi-asserted-by":"crossref","unstructured":"Sigurdsson GA, Varol G, Wang X, Farhadi A, Laptev I, Gupta A (2016) Hollywood in homes: Crowdsourcing data collection for activity understanding. In: Computer vision\u2013ECCV 2016: 14th European conference, Amsterdam, the Netherlands, October 11\u201314, 2016, Proceedings, Part I 14, Springer, pp 510\u2013526","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"20827_CR11","unstructured":"Chen B, Qiao Y, Wang Y (2022) Low-resolution action recognition for tiny actions challenge. arXiv preprint arXiv:2209.14711"},{"key":"20827_CR12","unstructured":"He J, Zhang Z, Xu Z, Luo Z (2021) Delving into high quality action recognition for low resolution videos. Technical report, DeepBlue technology (Shanghai) Co., Ltd . https:\/\/www.crcv.ucf.edu\/tiny-actions-challenge-cvpr2021\/submissions\/DeepBlueAI_Report.pdf"},{"key":"20827_CR13","unstructured":"Teng W, Geng T, Wang J, Zheng F (2021) Submission to tinyaction challenge 2021. Technical report, SUSTech &HKU . https:\/\/www.crcv.ucf.edu\/tiny-actions-challenge-cvpr2021\/submissions\/SUSTech&HKU_Report.pdf"},{"key":"20827_CR14","doi-asserted-by":"crossref","unstructured":"Arnab A, Dehghani M, Heigold G, Sun C, Lu\u010di\u0107 M, Schmid C (2021) Vivit: A video vision transformer. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6836\u20136846","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"20827_CR15","doi-asserted-by":"crossref","unstructured":"Li Y, Wu CY, Fan H, Mangalam K, Xiong B, Malik J, Feichtenhofer C (2022) Mvitv2: Improved multiscale vision transformers for classification and detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4804\u20134814","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"20827_CR16","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S et al (2021) An image is worth 16x16 words: Transformers for image recognition at scale. In international conference on learning representations, ICLR 2021, Virtual Event, Austria, May 3-7."},{"key":"20827_CR17","unstructured":"Tirupattur P, Rana AJ, Sangam T, Vyas S, Rawat YS, Shah M (2021) Tinyaction challenge: Recognizing real-world low-resolution activities in videos. arXiv preprint arXiv:2107.11494"},{"key":"20827_CR18","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. 
Adv Neural Inf Process Syst 27"},{"key":"20827_CR19","doi-asserted-by":"crossref","unstructured":"Sun S, Kuang Z, Sheng L, Ouyang W, Zhang W (2018) Optical flow guided feature: A fast and robust motion representation for video action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1390\u20131399","DOI":"10.1109\/CVPR.2018.00151"},{"key":"20827_CR20","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Pinz A, Zisserman A (2016) Convolutional two-stream network fusion for video action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1933\u20131941","DOI":"10.1109\/CVPR.2016.213"},{"key":"20827_CR21","doi-asserted-by":"crossref","unstructured":"Yue-Hei\u00a0Ng J, Hausknecht M, Vijayanarasimhan S, Vinyals O, Monga R, Toderici G (2015) Beyond short snippets: Deep networks for video classification. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4694\u20134702","DOI":"10.1109\/CVPR.2015.7299101"},{"key":"20827_CR22","doi-asserted-by":"crossref","unstructured":"Donahue J, Anne\u00a0Hendricks L, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T (2015) Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2625\u20132634","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"20827_CR23","unstructured":"Shi X, Chen Z, Wang H, Yeung DY, Wong WK, Woo Wc (2015) Convolutional lstm network: A machine learning approach for precipitation nowcasting. Adv Neural Inf Process Syst 28"},{"issue":"1","key":"20827_CR24","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2012","unstructured":"Ji S, Xu W, Yang M, Yu K (2012) 3d convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35(1):221\u2013231","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"20827_CR25","doi-asserted-by":"crossref","unstructured":"Taylor GW, Fergus R, LeCun Y, Bregler C (2010) Convolutional learning of spatio-temporal features. In: Computer Vision\u2013ECCV 2010: 11th european conference on computer vision, Heraklion, Crete, Greece, September 5-11, 2010, Proceedings, Part VI 11, Springer, pp 140\u2013153","DOI":"10.1007\/978-3-642-15567-3_11"},{"key":"20827_CR26","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"issue":"6","key":"20827_CR27","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/TPAMI.2017.2712608","volume":"40","author":"G Varol","year":"2017","unstructured":"Varol G, Laptev I, Schmid C (2017) Long-term temporal convolutions for action recognition. IEEE Trans Pattern Anal Mach Intell 40(6):1510\u20131517","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"20827_CR28","doi-asserted-by":"crossref","unstructured":"Tran D, Wang H, Torresani L, Feiszli M (2019) Video classification with channel-separated convolutional networks. 
In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 5552\u20135561","DOI":"10.1109\/ICCV.2019.00565"},{"key":"20827_CR29","doi-asserted-by":"crossref","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van\u00a0Gool L (2016) Temporal segment networks: Towards good practices for deep action recognition. In: European conference on computer vision, Springer, pp 20\u201336","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"20827_CR30","doi-asserted-by":"crossref","unstructured":"Zhou B, Andonian A, Oliva A, Torralba A (2018) Temporal relational reasoning in videos. In: Proceedings of the european conference on computer vision (ECCV), pp 803\u2013818","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"20827_CR31","doi-asserted-by":"crossref","unstructured":"Wang X, Girshick R, Gupta A, He K (2018) Non-local neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7794\u20137803","DOI":"10.1109\/CVPR.2018.00813"},{"key":"20827_CR32","doi-asserted-by":"crossref","unstructured":"Sun C, Shrivastava A, Vondrick C, Sukthankar R, Murphy K, Schmid C (2019) Relational action forecasting. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 273\u2013283","DOI":"10.1109\/CVPR.2019.00036"},{"key":"20827_CR33","doi-asserted-by":"crossref","unstructured":"Sun C, Shrivastava A, Vondrick C, Murphy K, Sukthankar R, Schmid C (2018) Actor-centric relation network. In: Proceedings of the european conference on computer vision (ECCV), pp 318\u2013334","DOI":"10.1007\/978-3-030-01252-6_20"},{"key":"20827_CR34","doi-asserted-by":"crossref","unstructured":"Tang J, Xia J, Mu X, Pang B, Lu C (2020) Asynchronous interaction aggregation for action detection. In: Computer vision\u2013ECCV 2020: 16th european conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XV 16, Springer, pp 71\u201387","DOI":"10.1007\/978-3-030-58555-6_5"},{"key":"20827_CR35","doi-asserted-by":"crossref","unstructured":"Zhang Y, Tokmakov P, Hebert M, Schmid C (2019) A structured model for action detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9975\u20139984","DOI":"10.1109\/CVPR.2019.01021"},{"key":"20827_CR36","doi-asserted-by":"crossref","unstructured":"Materzynska J, Xiao T, Herzig R, Xu H, Wang X, Darrell T (2020) Something-else: Compositional action recognition with spatial-temporal interaction networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 1049\u20131059","DOI":"10.1109\/CVPR42600.2020.00113"},{"key":"20827_CR37","doi-asserted-by":"crossref","unstructured":"Ji J, Krishna R, Fei-Fei L, Niebles JC (2020) Action genome: Actions as compositions of spatio-temporal scene graphs. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10236\u201310247","DOI":"10.1109\/CVPR42600.2020.01025"},{"key":"20827_CR38","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst 30"},{"key":"20827_CR39","doi-asserted-by":"crossref","unstructured":"Chen CFR, Fan Q, Panda R (2021) Crossvit: Cross-attention multi-scale vision transformer for image classification. 
In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 357\u2013366","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"20827_CR40","doi-asserted-by":"crossref","unstructured":"Liu J, Fan X, Huang Z, Wu G, Liu R, Zhong W, Luo Z (2022) Target-aware dual adversarial learning and a multi-scenario multi-modality benchmark to fuse infrared and visible for object detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5802\u20135811","DOI":"10.1109\/CVPR52688.2022.00571"},{"key":"20827_CR41","doi-asserted-by":"crossref","unstructured":"Xie H, Lee MX, Chen TJ, Chen HJ, Liu HI, Shuai HH, Cheng WH (2023) Most important person-guided dual-branch cross-patch attention for group affect recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 20598\u201320608","DOI":"10.1109\/ICCV51070.2023.01883"},{"key":"20827_CR42","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. arXiv preprint arXiv:1607.06450"},{"key":"20827_CR43","doi-asserted-by":"crossref","unstructured":"Demir U, Rawat YS, Shah M (2021) Tinyvirat: Low-resolution video action recognition. In: 2020 25th international conference on pattern recognition (ICPR), IEEE, pp 7387\u20137394","DOI":"10.1109\/ICPR48806.2021.9412541"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-025-20827-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-025-20827-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-025-20827-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,2]],"date-time":"2025-10-02T23:08:40Z","timestamp":1759446520000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-025-20827-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,15]]},"references-count":43,"journal-issue":{"issue":"34","published-online":{"date-parts":[[2025,10]]}},"alternative-id":["20827"],"URL":"https:\/\/doi.org\/10.1007\/s11042-025-20827-w","relation":{},"ISSN":["1573-7721"],"issn-type":[{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2025,4,15]]},"assertion":[{"value":"5 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 March 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 April 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 April 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}},{"value":"Part of the data used in this study comes from publicly available datasets, and the rest of the data involved has been mosaic-ed, and all participants 
gave their informed consent to participate in this study.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and Informed Consent for Data Used"}}]}}