{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T15:59:43Z","timestamp":1774713583722,"version":"3.50.1"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2023,7,20]],"date-time":"2023-07-20T00:00:00Z","timestamp":1689811200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,7,20]],"date-time":"2023-07-20T00:00:00Z","timestamp":1689811200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s11063-023-11367-1","type":"journal-article","created":{"date-parts":[[2023,7,20]],"date-time":"2023-07-20T16:02:23Z","timestamp":1689868943000},"page":"11109-11130","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Swin-Fusion: Swin-Transformer with Feature Fusion for Human Action Recognition"],"prefix":"10.1007","volume":"55","author":[{"given":"Tiansheng","family":"Chen","sequence":"first","affiliation":[]},{"given":"Lingfei","family":"Mo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,7,20]]},"reference":[{"key":"11367_CR1","doi-asserted-by":"crossref","unstructured":"Lin J, Gan C, Han S (2019) Tsm: Temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF international conference on computer vision, 7083\u20137093","DOI":"10.1109\/ICCV.2019.00718"},{"key":"11367_CR2","unstructured":"Li K, Wang Y, Gao P, Song G, Liu Y, Li H, Qiao Y (2022) Uniformer: Unified transformer for efficient spatiotemporal representation learning. arXiv preprint arXiv:2201.04676"},{"key":"11367_CR3","doi-asserted-by":"crossref","unstructured":"Girdhar R, Singh M, Ravi N, van der Maaten L, Joulin A, Misra I (2022) Omnivore: A single model for many visual modalities. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 16102\u201316112","DOI":"10.1109\/CVPR52688.2022.01563"},{"issue":"5","key":"11367_CR4","doi-asserted-by":"publisher","first-page":"3117","DOI":"10.1002\/int.22814","volume":"37","author":"J Zhang","year":"2022","unstructured":"Zhang J, Yang J, Yu J, Fan J (2022) Semisupervised image classification by mutual learning of multiple self-supervised models. Int J Intell Syst 37(5):3117\u20133141","journal-title":"Int J Intell Syst"},{"key":"11367_CR5","doi-asserted-by":"publisher","first-page":"475","DOI":"10.1016\/j.neucom.2017.06.041","volume":"267","author":"T Qi","year":"2017","unstructured":"Qi T, Xu Y, Quan Y, Wang Y, Ling H (2017) Image-based action recognition using hint-enhanced deep neural networks. Neurocomputing 267:475\u2013488","journal-title":"Neurocomputing"},{"key":"11367_CR6","doi-asserted-by":"crossref","unstructured":"Lavinia Y, Vo HH, Verma A (2016) Fusion based deep cnn for improved large-scale image action recognition. In: 2016 IEEE international symposium on multimedia (ISM), 609\u2013614. IEEE","DOI":"10.1109\/ISM.2016.0131"},{"key":"11367_CR7","doi-asserted-by":"publisher","first-page":"47051","DOI":"10.1109\/ACCESS.2022.3171263","volume":"10","author":"K Hirooka","year":"2022","unstructured":"Hirooka K, Hasan MAM, Shin J, Srizon AY (2022) Ensembled transfer learning based multichannel attention networks for human activity recognition in still images. IEEE Access 10:47051\u201347062","journal-title":"IEEE Access"},{"key":"11367_CR8","doi-asserted-by":"crossref","unstructured":"Mohammadi S, Majelan SG, Shokouhi SB (2019) Ensembles of deep neural networks for action recognition in still images. In: 2019 9th international conference on computer and knowledge engineering (ICCKE), 315\u2013318. IEEE","DOI":"10.1109\/ICCKE48569.2019.8965014"},{"key":"11367_CR9","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2022.104568","volume":"127","author":"Z Chong","year":"2022","unstructured":"Chong Z, Mo L (2022) St-vton: self-supervised vision transformer for image-based virtual try-on. Image Vis Comput 127:104568","journal-title":"Image Vis Comput"},{"issue":"12","key":"11367_CR10","doi-asserted-by":"publisher","first-page":"4467","DOI":"10.1109\/TCSVT.2019.2947482","volume":"30","author":"J Yu","year":"2019","unstructured":"Yu J, Li J, Yu Z, Huang Q (2019) Multimodal transformer with multi-view visual representation for image captioning. IEEE Trans Circuits Syst Video Technol 30(12):4467\u20134480","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"11367_CR11","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107952","volume":"116","author":"J Zhang","year":"2021","unstructured":"Zhang J, Cao Y, Wu Q (2021) Vector of locally and adaptively aggregated descriptors for image feature representation. Pattern Recogn 116:107952","journal-title":"Pattern Recogn"},{"key":"11367_CR12","unstructured":"Csurka G, Dance C, Fan L, Willamowski J, Bray C (2004) Visual categorization with bags of keypoints. In: Workshop on statistical learning in computer vision, ECCV, vol 1, 1\u20132. Prague"},{"key":"11367_CR13","doi-asserted-by":"crossref","unstructured":"Ikizler N, Cinbis RG, Pehlivan S, Duygulu P (2008) Recognizing actions from still images. In: 2008 19th international conference on pattern recognition, pp 1\u20134. IEEE","DOI":"10.1109\/ICPR.2008.4761663"},{"key":"11367_CR14","doi-asserted-by":"crossref","unstructured":"Yao B, Khosla A, Fei-Fei L (2011) Combining randomization and discrimination for fine-grained image categorization. In: CVPR 2011, pp 1577\u20131584. IEEE","DOI":"10.1109\/CVPR.2011.5995368"},{"key":"11367_CR15","doi-asserted-by":"crossref","unstructured":"Yu X, Zhang Z, Wu L, Pang W, Chen H, Yu Z, Li B (2020) Deep ensemble learning for human action recognition in still images. Complexity 2020","DOI":"10.1155\/2020\/9428612"},{"key":"11367_CR16","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"11367_CR17","doi-asserted-by":"crossref","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, Erhan D, Vanhoucke V, Rabinovich A (2015) Going deeper with convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"11367_CR18","doi-asserted-by":"publisher","first-page":"563","DOI":"10.1016\/j.procs.2018.10.432","volume":"143","author":"S Sreela","year":"2018","unstructured":"Sreela S, Idicula SM (2018) Action recognition in still images using residual neural network features. Procedia Comput. Sci. 143:563\u2013569","journal-title":"Procedia Comput. Sci."},{"key":"11367_CR19","doi-asserted-by":"crossref","unstructured":"Gkioxari G, Girshick R, Malik J (2015) Contextual action recognition with r* cnn. In: Proceedings of the IEEE international conference on computer vision, 1080\u20131088","DOI":"10.1109\/ICCV.2015.129"},{"issue":"11","key":"11367_CR20","doi-asserted-by":"publisher","first-page":"5479","DOI":"10.1109\/TIP.2016.2605305","volume":"25","author":"Y Zhang","year":"2016","unstructured":"Zhang Y, Cheng L, Wu J, Cai J, Do MN, Lu J (2016) Action recognition in still images with minimum annotation efforts. IEEE Trans Image Process 25(11):5479\u20135490","journal-title":"IEEE Trans Image Process"},{"key":"11367_CR21","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, et al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"11367_CR22","unstructured":"Touvron H, Cord M, Douze M, Massa F, Sablayrolles A, J\u00e9gou H (2021) Training data-efficient image transformers and distillation through attention. In: International conference on machine learning, 10347\u201310357. PMLR"},{"key":"11367_CR23","doi-asserted-by":"crossref","unstructured":"Yu W, Luo M, Zhou P, Si C, Zhou Y, Wang X, Feng J, Yan S (2022) Metaformer is actually what you need for vision. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 10819\u201310829","DOI":"10.1109\/CVPR52688.2022.01055"},{"key":"11367_CR24","unstructured":"Li Y, Yuan G, Wen Y, Hu E, Evangelidis G, Tulyakov S, Wang Y, Ren J (2022) Efficientformer: vision transformers at mobilenet speed. arXiv preprint arXiv:2206.01191"},{"key":"11367_CR25","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, Hu H, Wei Y, Zhang Z, Lin S, Guo B (2021) Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF international conference on computer vision, 10012\u201310022","DOI":"10.1109\/ICCV48922.2021.00986"},{"issue":"2","key":"11367_CR26","doi-asserted-by":"publisher","first-page":"217","DOI":"10.1007\/s11263-011-0505-4","volume":"98","author":"J Cruz-Mota","year":"2012","unstructured":"Cruz-Mota J, Bogdanova I, Paquier B, Bierlaire M, Thiran J-P (2012) Scale invariant feature transform on the sphere: theory and applications. Int J Comput Vis. 98(2):217\u2013241","journal-title":"Int J Comput Vis."},{"key":"11367_CR27","doi-asserted-by":"crossref","unstructured":"Dalal N, Triggs B (2005) Histograms of oriented gradients for human detection. In: 2005 IEEE computer society conference on computer vision and pattern recognition (CVPR\u201905), vol 1, 886\u2013893. IEEE","DOI":"10.1109\/CVPR.2005.177"},{"key":"11367_CR28","doi-asserted-by":"crossref","unstructured":"Long J, Shelhamer E, Darrell T (2015) Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 3431\u20133440","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"11367_CR29","doi-asserted-by":"crossref","unstructured":"Hariharan B, Arbel\u00e1ez P, Girshick R, Malik J (2015) Hypercolumns for object segmentation and fine-grained localization. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 447\u2013456","DOI":"10.1109\/CVPR.2015.7298642"},{"key":"11367_CR30","doi-asserted-by":"crossref","unstructured":"Ghiasi G, Fowlkes CC (2016) Laplacian pyramid reconstruction and refinement for semantic segmentation. In: European conference on computer vision, 519\u2013534. Springer","DOI":"10.1007\/978-3-319-46487-9_32"},{"key":"11367_CR31","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Doll\u00e1r P, Girshick R, He K, Hariharan B, Belongie S (2017) Feature pyramid networks for object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2117\u20132125","DOI":"10.1109\/CVPR.2017.106"},{"key":"11367_CR32","doi-asserted-by":"crossref","unstructured":"Li Z, Ge Y, Feng J, Qin X, Yu J, Yu H (2020) Deep selective feature learning for action recognition. In: 2020 IEEE international conference on multimedia and expo (ICME), 1\u20136. IEEE","DOI":"10.1109\/ICME46284.2020.9102727"},{"key":"11367_CR33","doi-asserted-by":"publisher","first-page":"61386","DOI":"10.1109\/ACCESS.2018.2872798","volume":"6","author":"R Li","year":"2018","unstructured":"Li R, Liu Z, Tan J (2018) Reassessing hierarchical representation for action recognition in still images. IEEE Access 6:61386\u201361400","journal-title":"IEEE Access"},{"key":"11367_CR34","doi-asserted-by":"publisher","first-page":"3691","DOI":"10.1109\/TIP.2021.3064256","volume":"30","author":"A Bera","year":"2021","unstructured":"Bera A, Wharton Z, Liu Y, Bessis N, Behera A (2021) Attend and guide (ag-net): a keypoints-driven attention-based deep network for image recognition. IEEE Trans Image Process 30:3691\u20133704","journal-title":"IEEE Trans Image Process"},{"key":"11367_CR35","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2020.3031841","author":"A Behera","year":"2020","unstructured":"Behera A, Wharton Z, Liu Y, Ghahremani M, Kumar S, Bessis N (2020) Regional attention network (ran) for head pose and fine-grained gesture recognition. IEEE Trans Affect Comput. https:\/\/doi.org\/10.1109\/TAFFC.2020.3031841","journal-title":"IEEE Trans Affect Comput"},{"key":"11367_CR36","doi-asserted-by":"publisher","DOI":"10.1155\/2019\/4125865","author":"HM Eraqi","year":"2019","unstructured":"Eraqi HM, Abouelnaga Y, Saad MH, Moustafa MN (2019) Driver distraction identification with an ensemble of convolutional neural networks. J Adv Transp. https:\/\/doi.org\/10.1155\/2019\/4125865","journal-title":"J Adv Transp"},{"key":"11367_CR37","doi-asserted-by":"crossref","unstructured":"Wharton Z, Behera A, Liu Y, Bessis N (2021) Coarse temporal attention network (cta-net) for driver\u2019s activity recognition. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, 1279\u20131289","DOI":"10.1109\/WACV48630.2021.00132"},{"issue":"3","key":"11367_CR38","doi-asserted-by":"publisher","first-page":"617","DOI":"10.1007\/s11760-019-01589-z","volume":"14","author":"M Alotaibi","year":"2020","unstructured":"Alotaibi M, Alotaibi B (2020) Distracted driver classification using deep learning. SIViP 14(3):617\u2013624","journal-title":"SIViP"},{"key":"11367_CR39","doi-asserted-by":"crossref","unstructured":"Arefin MR, Makhmudkhujaev F, Chae O, Kim J (2019) Aggregating cnn and hog features for real-time distracted driver detection. In: 2019 IEEE international conference on consumer electronics (ICCE), 1\u20133. IEEE","DOI":"10.1109\/ICCE.2019.8661970"},{"key":"11367_CR40","doi-asserted-by":"crossref","unstructured":"Behera A, Keidel AH (2018) Latent body-pose guided densenet for recognizing driver\u2019s fine-grained secondary activities. In: 2018 15th IEEE international conference on advanced video and signal based surveillance (AVSS), 1\u20136. IEEE","DOI":"10.1109\/AVSS.2018.8639158"},{"key":"11367_CR41","doi-asserted-by":"crossref","unstructured":"Wu M, Zhang X, Shen L, Yu H (2021) Pose-aware multi-feature fusion network for driver distraction recognition. In: 2020 25th international conference on pattern recognition (ICPR), 1228\u20131235. IEEE","DOI":"10.1109\/ICPR48806.2021.9413337"},{"key":"11367_CR42","doi-asserted-by":"crossref","unstructured":"Mase JM, Chapman P, Figueredo GP, Torres MT (2020) A hybrid deep learning approach for driver distraction detection. In: 2020 international conference on information and communication technology convergence (ICTC), 1\u20136. IEEE","DOI":"10.1109\/ICTC49870.2020.9289588"},{"key":"11367_CR43","doi-asserted-by":"crossref","unstructured":"Selvaraju RR, Cogswell M, Das A, Vedantam R, Parikh D, Batra D (2017) Grad-cam: Visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE international conference on computer vision, 618\u2013626","DOI":"10.1109\/ICCV.2017.74"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11367-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-023-11367-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11367-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,24]],"date-time":"2024-10-24T18:45:24Z","timestamp":1729795524000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-023-11367-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,20]]},"references-count":43,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["11367"],"URL":"https:\/\/doi.org\/10.1007\/s11063-023-11367-1","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"value":"1370-4621","type":"print"},{"value":"1573-773X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,7,20]]},"assertion":[{"value":"10 July 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 July 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}