{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T14:21:29Z","timestamp":1775744489912,"version":"3.50.1"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s00371-026-04441-x","type":"journal-article","created":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T13:40:50Z","timestamp":1774618850000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal fusion for enhanced hand\u2013object interaction recognition"],"prefix":"10.1007","volume":"42","author":[{"given":"Jie","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lixin","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoheng","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chunsheng","family":"Hua","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,3,27]]},"reference":[{"issue":"9","key":"4441_CR1","doi-asserted-by":"publisher","first-page":"6449","DOI":"10.1007\/s00371-023-03175-4","volume":"40","author":"AK Dash","year":"2024","unstructured":"Dash, A.K., Balaji, K.V., Dogra, D.P., et al.: Interactions with 3d virtual objects in augmented reality using natural gestures. Vis. Comput. 40(9), 6449\u20136462 (2024)","journal-title":"Vis. Comput."},{"issue":"4","key":"4441_CR2","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1145\/3386569.3392452","volume":"39","author":"S Han","year":"2020","unstructured":"Han, S., Liu, B., Cabezas, R., et al.: Megatrack: monochrome egocentric articulated hand-tracking for virtual reality. ACM Transactions on Graphics (ToG) 39(4), 87\u20131 (2020)","journal-title":"ACM Transactions on Graphics (ToG)"},{"key":"4441_CR3","doi-asserted-by":"crossref","unstructured":"Huang, N., Goswami, P., Sundstedt, V. et\u00a0al.: Personalized smart immersive xr environments: a systematic literature review. The Visual Computer pp 1\u201334 (2025)","DOI":"10.1007\/s00371-025-03887-9"},{"key":"4441_CR4","first-page":"328","volume":"45","author":"A Karambakhsh","year":"2019","unstructured":"Karambakhsh, A., Kamel, A., Sheng, B., et al.: Deep gesture interaction for augmented anatomy learning. Int. J. Inf. Manage. 45, 328\u2013336 (2019)","journal-title":"Int. J. Inf. Manage."},{"key":"4441_CR5","doi-asserted-by":"publisher","DOI":"10.3389\/frobt.2021.714023","volume":"8","author":"A Carf\u00ec","year":"2021","unstructured":"Carf\u00ec, A., Patten, T., Kuang, Y., et al.: Hand-object interaction: from human demonstrations to robot manipulation. Front. Robot. AI 8, 714023 (2021)","journal-title":"Front. Robot. AI"},{"issue":"1","key":"4441_CR6","doi-asserted-by":"publisher","first-page":"297","DOI":"10.1146\/annurev-control-100819-063206","volume":"3","author":"H Ravichandar","year":"2020","unstructured":"Ravichandar, H., Polydoros, A.S., Chernova, S., et al.: Recent advances in robot learning from demonstration. Annual Rev. Control Robot. Autonomous Sys. 3(1), 297\u2013330 (2020)","journal-title":"Annual Rev. Control Robot. Autonomous Sys."},{"key":"4441_CR7","unstructured":"Ding, J., Perzylo, A., Zhou, L.: A knowledge-augmented concept for programming by demonstration based on hand-object actions. In: International workshop on working towards ontology-based standards for robotics and automation (WOSRA), ICRA (2023)"},{"key":"4441_CR8","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1016\/j.promfg.2017.07.221","volume":"11","author":"M Haage","year":"2017","unstructured":"Haage, M., Piperagkas, G., Papadopoulos, C., et al.: Teaching assembly by demonstration using advanced human robot interaction and a knowledge integration framework. Procedia Manufact. 11, 164\u2013173 (2017)","journal-title":"Procedia Manufact."},{"issue":"4","key":"4441_CR9","doi-asserted-by":"publisher","first-page":"1224","DOI":"10.3390\/s25041224","volume":"25","author":"S Muksimova","year":"2025","unstructured":"Muksimova, S., Valikhujaev, Y., Umirzakova, S., et al.: Gazecapsnet: a lightweight gaze estimation framework. Sensors 25(4), 1224 (2025)","journal-title":"Sensors"},{"key":"4441_CR10","unstructured":"Kapitanov, A., Kvanchiani, K., Nagaev, A. et\u00a0al.: Hagrid\u2013hand gesture recognition image dataset. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 4572\u20134581 (2024)"},{"issue":"20","key":"4441_CR11","doi-asserted-by":"publisher","first-page":"31309","DOI":"10.1007\/s11042-023-14732-3","volume":"82","author":"Y Zhang","year":"2023","unstructured":"Zhang, Y., Wang, J., Wang, X., et al.: Static hand gesture recognition method based on the vision transformer. Multimedia Tools Appl. 82(20), 31309\u201331328 (2023)","journal-title":"Multimedia Tools Appl."},{"key":"4441_CR12","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., et\u00a0al.: Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"4441_CR13","doi-asserted-by":"crossref","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Anticipating visual representations from unlabeled video. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 98\u2013106 (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"4441_CR14","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Grauman, K.: Anticipative video transformer. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 13505\u201313515 (2021)","DOI":"10.1109\/ICCV48922.2021.01325"},{"key":"4441_CR15","doi-asserted-by":"crossref","unstructured":"Neimark, D., Bar, O., Zohar, M., et\u00a0al.: Video transformer network. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 3163\u20133172 (2021)","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"4441_CR16","doi-asserted-by":"crossref","unstructured":"Ji, J., Krishna, R., Fei-Fei, L., et al.: Action genome: Actions as compositions of spatio-temporal scene graphs. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10236\u201310247 (2020)","DOI":"10.1109\/CVPR42600.2020.01025"},{"key":"4441_CR17","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: Proceedings of the AAAI conference on artificial intelligence (2018)","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"4441_CR18","doi-asserted-by":"publisher","first-page":"811","DOI":"10.1109\/TMM.2023.3271811","volume":"26","author":"J Liu","year":"2023","unstructured":"Liu, J., Wang, X., Wang, C., et al.: Temporal decoupling graph convolutional network for skeleton-based gesture recognition. IEEE Trans. Multimedia 26, 811\u2013823 (2023)","journal-title":"IEEE Trans. Multimedia"},{"issue":"4","key":"4441_CR19","doi-asserted-by":"publisher","first-page":"1586","DOI":"10.1109\/TIP.2017.2785279","volume":"27","author":"J Liu","year":"2017","unstructured":"Liu, J., Wang, G., Duan, L.Y., et al.: Skeleton-based human action recognition with global context-aware attention lstm networks. IEEE Trans. Image Process. 27(4), 1586\u20131599 (2017)","journal-title":"IEEE Trans. Image Process."},{"key":"4441_CR20","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2021.103219","volume":"208","author":"C Plizzari","year":"2021","unstructured":"Plizzari, C., Cannici, M., Matteucci, M.: Skeleton-based action recognition via spatial and temporal transformer networks. Comput. Vis. Image Underst. 208, 103219 (2021)","journal-title":"Comput. Vis. Image Underst."},{"key":"4441_CR21","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et\u00a0al.: Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"key":"4441_CR22","first-page":"8777355","volume":"1","author":"A Alnuaim","year":"2022","unstructured":"Alnuaim, A., Zakariah, M., Hatamleh, W.A., et al.: Human-computer interaction with hand gesture recognition using resnet and mobilenet. Comput. Intell. Neurosci. 1, 8777355 (2022)","journal-title":"Comput. Intell. Neurosci."},{"key":"4441_CR23","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/TMM.2021.3120873","volume":"25","author":"X Lin","year":"2021","unstructured":"Lin, X., Sun, S., Huang, W., et al.: Eapt: efficient attention pyramid transformer for image processing. IEEE Trans. Multimedia 25, 50\u201361 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"4441_CR24","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems 27 (2014)"},{"key":"4441_CR25","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., et al.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"4441_CR26","doi-asserted-by":"crossref","unstructured":"Duan, H., Zhao, Y., Chen, K., et\u00a0al.: Revisiting skeleton-based action recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 2969\u20132978 (2022)","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"4441_CR27","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML, p\u00a04 (2021)"},{"issue":"4","key":"4441_CR28","doi-asserted-by":"publisher","first-page":"2179","DOI":"10.1109\/TCDS.2023.3242988","volume":"15","author":"SH Peng","year":"2023","unstructured":"Peng, S.H., Tsai, P.H.: An efficient graph convolution network for skeleton-based dynamic hand gesture recognition. IEEE Trans Cognit Develop Sys 15(4), 2179\u20132189 (2023)","journal-title":"IEEE Trans Cognit Develop Sys"},{"issue":"1","key":"4441_CR29","doi-asserted-by":"publisher","DOI":"10.1002\/cav.2207","volume":"35","author":"U Aiman","year":"2024","unstructured":"Aiman, U., Ahmad, T.: Angle based hand gesture recognition using graph convolutional network. Comput. Animat. Virtual Worlds 35(1), e2207 (2024)","journal-title":"Comput. Animat. Virtual Worlds"},{"issue":"4","key":"4441_CR30","doi-asserted-by":"publisher","first-page":"7823","DOI":"10.1109\/LRA.2021.3101822","volume":"6","author":"A Sabater","year":"2021","unstructured":"Sabater, A., Alonso, I., Montesano, L., et al.: Domain and view-point agnostic hand action recognition. IEEE Robot. Autom. Lett. 6(4), 7823\u20137830 (2021)","journal-title":"IEEE Robot. Autom. Lett."},{"issue":"1","key":"4441_CR31","doi-asserted-by":"publisher","first-page":"11","DOI":"10.1007\/s00371-022-02762-1","volume":"40","author":"H Mahmud","year":"2024","unstructured":"Mahmud, H., Morshed, M.M., Hasan, M.K.: Quantized depth image and skeleton-based multimodal dynamic hand gesture recognition. Vis. Comput. 40(1), 11\u201325 (2024)","journal-title":"Vis. Comput."},{"key":"4441_CR32","unstructured":"De Smedt, Q., Wannous, H., Vandeborre, J.P., et al.: Shrec\u201917 track: 3d hand gesture recognition using a depth and skeletal dataset. In: 3DOR-10th Eurographics Workshop on 3D Object Retrieval, pp 1\u20136 (2017)"},{"key":"4441_CR33","doi-asserted-by":"crossref","unstructured":"Boulahia, S.Y., Anquetil, E., Multon, F., et al.: Dynamic hand gesture recognition based on 3d pattern assembled trajectories. In: 2017 seventh international conference on image processing theory, tools and applications (IPTA), IEEE, pp 1\u20136 (2017)","DOI":"10.1109\/IPTA.2017.8310146"},{"key":"4441_CR34","doi-asserted-by":"crossref","unstructured":"Wen, Y., Tang, Z., Pang, Y., et\u00a0al.: Interactive spatiotemporal token attention network for skeleton-based general interactive action recognition. In: 2023 IEEE\/RSJ international conference on intelligent robots and systems (IROS), IEEE, pp 7886\u20137892 (2023)","DOI":"10.1109\/IROS55552.2023.10342472"},{"key":"4441_CR35","doi-asserted-by":"crossref","unstructured":"Shamil, M.S., Chatterjee, D., Sener, F., et al.: On the utility of 3d hand poses for action recognition. In: European conference on computer vision, Springer, pp 436\u2013454 (2024)","DOI":"10.1007\/978-3-031-72658-3_25"},{"key":"4441_CR36","doi-asserted-by":"crossref","unstructured":"Tekin, B., Bogo, F., Pollefeys, M.: H+ o: Unified egocentric recognition of 3d hand-object poses and interactions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4511\u20134520 (2019)","DOI":"10.1109\/CVPR.2019.00464"},{"key":"4441_CR37","doi-asserted-by":"crossref","unstructured":"Wen, Y., Pan, H., Yang, L., et\u00a0al.: Hierarchical temporal transformer for 3d hand pose estimation and action recognition from egocentric rgb videos. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 21243\u201321253 (2023)","DOI":"10.1109\/CVPR52729.2023.02035"},{"key":"4441_CR38","doi-asserted-by":"crossref","unstructured":"Kwon, T., Tekin, B., St\u00fchmer, J., et al.: H2o: Two hands manipulating objects for first person interaction recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10138\u201310148 (2021)","DOI":"10.1109\/ICCV48922.2021.00998"},{"key":"4441_CR39","doi-asserted-by":"crossref","unstructured":"Garcia-Hernando, G., Yuan, S., Baek, S., et al.: First-person hand action benchmark with rgb-d videos and 3d hand pose annotations. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 409\u2013419 (2018)","DOI":"10.1109\/CVPR.2018.00050"},{"key":"4441_CR40","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et al.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"4441_CR41","doi-asserted-by":"crossref","unstructured":"Zimmermann, C., Ceylan, D., Yang, J., et al.: Freihand: A dataset for markerless capture of hand pose and shape from single rgb images. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 813\u2013822 (2019)","DOI":"10.1109\/ICCV.2019.00090"},{"key":"4441_CR42","unstructured":"Lugaresi, C., Tang, J., Nash, H., et al.: Mediapipe: A framework for building perception pipelines. (2019) arXiv:1906.08172"},{"key":"4441_CR43","doi-asserted-by":"crossref","unstructured":"Wang, C., Xu, D., Zhu, Y., et\u00a0al.: Densefusion: 6d object pose estimation by iterative dense fusion. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3343\u20133352 (2019)","DOI":"10.1109\/CVPR.2019.00346"},{"key":"4441_CR44","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"4441_CR45","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111343","volume":"162","author":"J Liu","year":"2025","unstructured":"Liu, J., Wang, Y., Xiang, S., et al.: Han: An efficient hierarchical self-attention network for skeleton-based gesture recognition. Pattern Recogn. 162, 111343 (2025)","journal-title":"Pattern Recogn."},{"key":"4441_CR46","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.120735","volume":"232","author":"S Narayan","year":"2023","unstructured":"Narayan, S., Mazumdar, A.P., Vipparthi, S.K.: Sbi-dhgr: Skeleton-based intelligent dynamic hand gestures recognition. Expert Syst. Appl. 232, 120735 (2023)","journal-title":"Expert Syst. Appl."},{"issue":"5","key":"4441_CR47","doi-asserted-by":"publisher","first-page":"2208","DOI":"10.1109\/TNNLS.2020.3044176","volume":"33","author":"R Wang","year":"2021","unstructured":"Wang, R., Wu, X.J., Kittler, J.: Symnet: a simple symmetric positive definite manifold deep learning method for image set classification. IEEE Trans. Neural Netw. Learn. Sys. 33(5), 2208\u20132222 (2021)","journal-title":"IEEE Trans. Neural Netw. Learn. Sys."},{"issue":"6","key":"4441_CR48","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1007\/s00138-022-01328-4","volume":"33","author":"R Li","year":"2022","unstructured":"Li, R., Wang, H.: Graph convolutional networks and LSTM for first-person multimodal hand action recognition. Mach. Vis. Appl. 33(6), 84 (2022)","journal-title":"Mach. Vis. Appl."},{"key":"4441_CR49","doi-asserted-by":"crossref","unstructured":"Cho, H., Kim, C., Kim, J., et\u00a0al.: Transformer-based unified recognition of two hands manipulating objects. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4769\u20134778 (2023)","DOI":"10.1109\/CVPR52729.2023.00462"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-026-04441-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-026-04441-x","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-026-04441-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T13:39:08Z","timestamp":1775741948000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-026-04441-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":49,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["4441"],"URL":"https:\/\/doi.org\/10.1007\/s00371-026-04441-x","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"27 November 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 March 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"228"}}