{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T19:05:05Z","timestamp":1761419105541,"version":"build-2065373602"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"14","license":[{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100022963","name":"Key Research and Development Program of Zhejiang Province","doi-asserted-by":"publisher","award":["No. 2024C03092"],"award-info":[{"award-number":["No. 2024C03092"]}],"id":[{"id":"10.13039\/100022963","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Natural Science Foundation of China under Grant","award":["No. 62371172"],"award-info":[{"award-number":["No. 62371172"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s11760-025-04799-w","type":"journal-article","created":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T12:46:43Z","timestamp":1758545203000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["3D hand pose estimation based on lightweight CNN and separable self-attention vision transformer"],"prefix":"10.1007","volume":"19","author":[{"given":"Tianpei","family":"Jin","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingshan","family":"She","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qixiang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiang","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaofei","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,9,22]]},"reference":[{"key":"4799_CR1","doi-asserted-by":"crossref","unstructured":"Guleryuz, O.\u00a0G., Kaeser-Chen, C.: Fast lifting for 3d hand pose estimation in ar\/vr applications. In 2018 25th IEEE International Conference on Image Processing (ICIP), pages 106\u2013110. IEEE, (2018)","DOI":"10.1109\/ICIP.2018.8451559"},{"key":"4799_CR2","doi-asserted-by":"publisher","first-page":"191","DOI":"10.1016\/j.cag.2021.04.017","volume":"97","author":"Y Li","year":"2021","unstructured":"Li, Y., Ma, D., Yu, Y., Wei, G., Zhou, Y.: Compact joints encoding for skeleton-based dynamic hand gesture recognition. Computers & Graphics 97, 191\u2013199 (2021)","journal-title":"Computers & Graphics"},{"issue":"17","key":"4799_CR3","doi-asserted-by":"publisher","first-page":"12349","DOI":"10.1007\/s11042-019-08587-w","volume":"79","author":"Q She","year":"2020","unstructured":"She, Q., Mu, G., Gan, H., Fan, Y.: Spatio-temporal sru with global context-aware attention for 3d human action recognition. Multimedia Tools and Applications 79(17), 12349\u201312371 (2020)","journal-title":"Multimedia Tools and Applications"},{"issue":"3","key":"4799_CR4","doi-asserted-by":"publisher","first-page":"207","DOI":"10.1016\/j.vrih.2021.05.002","volume":"3","author":"L Huang","year":"2021","unstructured":"Huang, L., Zhang, B., Guo, Z., Xiao, Y., Cao, Z., Yuan, J.: Survey on depth and rgb image-based 3d hand shape and pose estimation. Virtual Reality & Intelligent Hardware 3(3), 207\u2013234 (2021)","journal-title":"Virtual Reality & Intelligent Hardware"},{"key":"4799_CR5","doi-asserted-by":"crossref","unstructured":"Iqbal, U., Molchanov, P., Gall, T.\u00a0B.\u00a0J.: et\u00a0al. Hand pose estimation via latent 2.5 d heatmap regression. In Proceedings of the European conference on computer vision (ECCV), pages 118\u2013134, (2018)","DOI":"10.1007\/978-3-030-01252-6_8"},{"key":"4799_CR6","doi-asserted-by":"crossref","unstructured":"Ge, L., Ren, Z., Li, Y., Xue, Z., Wang, Y., Cai, J., Yuan, J.: 3d hand shape and pose estimation from a single rgb image. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 10833\u201310842, (2019)","DOI":"10.1109\/CVPR.2019.01109"},{"key":"4799_CR7","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Habermann, M., Xu, W., Habibie, I., Theobalt, C., Xu, F.: Monocular real-time hand shape and motion capture using multi-modal data. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 5346\u20135355, (2020)","DOI":"10.1109\/CVPR42600.2020.00539"},{"issue":"4","key":"4799_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073596","volume":"36","author":"D Mehta","year":"2017","unstructured":"Mehta, D., Sridhar, S., Sotnychenko, O., Rhodin, H., Shafiei, M., Seidel, H.-P., Xu, W., Casas, D., Theobalt, C.: Vnect: real-time 3d human pose estimation with a single rgb camera. Acm transactions on graphics (tog) 36(4), 1\u201314 (2017)","journal-title":"Acm transactions on graphics (tog)"},{"key":"4799_CR9","unstructured":"Chen, J., Mei, J., Li, X., Lu, Y., Yu, Q., Wei, Q., Luo, X., Xie, Y., Adeli, E., Wang, Y.: et\u00a0al. 3d transunet: Advancing medical image segmentation through vision transformers. arXiv preprint arXiv:2310.07781, (2023)"},{"key":"4799_CR10","doi-asserted-by":"crossref","unstructured":"Gorade, V., Mittal, S., Jha, D., Bagci, U.: Synergynet: Bridging the gap between discrete and continuous representations for precise medical image segmentation. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pages 7768\u20137777, (2024)","DOI":"10.1109\/WACV57701.2024.00759"},{"key":"4799_CR11","doi-asserted-by":"crossref","unstructured":"Oikonomidis, I., Kyriazis, N.: et\u00a0al. Efficient model-based 3d tracking of hand articulations using kinect. In BmVC, volume\u00a01, page\u00a03, (2011)","DOI":"10.5244\/C.25.101"},{"issue":"5","key":"4799_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2629500","volume":"33","author":"J Tompson","year":"2014","unstructured":"Tompson, J., Stein, M., Lecun, Y., Perlin, K.: Real-time continuous pose recovery of human hands using convolutional networks. ACM Transactions on Graphics (ToG) 33(5), 1\u201310 (2014)","journal-title":"ACM Transactions on Graphics (ToG)"},{"key":"4799_CR13","doi-asserted-by":"crossref","unstructured":"Ge, L., Liang, H., Yuan, J.: et\u00a0al. 3d convolutional neural networks for efficient and robust hand pose estimation from single depth images. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 1991\u20132000, (2017)","DOI":"10.1109\/CVPR.2017.602"},{"issue":"9","key":"4799_CR14","doi-asserted-by":"publisher","first-page":"4422","DOI":"10.1109\/TIP.2018.2834824","volume":"27","author":"L Ge","year":"2018","unstructured":"Ge, L., Liang, H., Yuan, J., Thalmann, D.: Robust 3d hand pose estimation from single depth images using multi-view cnns. IEEE Trans. Image Process. 27(9), 4422\u20134436 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"4799_CR15","doi-asserted-by":"crossref","unstructured":"Wu, X., Finnegan, D., O\u2019Neill, E., Yang, Y.-L.: Handmap: Robust hand pose estimation via intermediate dense guidance map supervision. In Proceedings of the European Conference on Computer Vision (ECCV), pages 237\u2013253, (2018)","DOI":"10.1007\/978-3-030-01270-0_15"},{"key":"4799_CR16","doi-asserted-by":"crossref","unstructured":"Xiong, F., Zhang, B., Xiao, Y., Cao, Z., Yu, T., Zhou, J.\u00a0T., Yuan, J.: A2j: Anchor-to-joint regression network for 3d articulated pose estimation from a single depth image. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 793\u2013802, (2019)","DOI":"10.1109\/ICCV.2019.00088"},{"key":"4799_CR17","doi-asserted-by":"crossref","unstructured":"Zimmermann, C., Brox, T.: Learning to estimate 3d hand pose from single rgb images. In Proceedings of the IEEE international conference on computer vision, pages 4903\u20134911, (2017)","DOI":"10.1109\/ICCV.2017.525"},{"key":"4799_CR18","doi-asserted-by":"crossref","unstructured":"Mueller, F., Bernard, F., Sotnychenko, O., Mehta, D., Sridhar, S., Casas, D.: et\u00a0al. Ganerated hands for real-time 3d hand tracking from monocular rgb. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 49\u201359, (2018)","DOI":"10.1109\/CVPR.2018.00013"},{"key":"4799_CR19","doi-asserted-by":"crossref","unstructured":"Cai, Y., Ge, L., Liu, J., Cai, J., Cham, T.-J., Yuan, J., Thalmann, N.\u00a0M.: Exploiting spatial-temporal relationships for 3d pose estimation via graph convolutional networks. In Proceedings of the IEEE\/CVF international conference on computer vision, pages 2272\u20132281, (2019)","DOI":"10.1109\/ICCV.2019.00236"},{"key":"4799_CR20","doi-asserted-by":"crossref","unstructured":"Fan, Z., Liu, J., Wang, Y.: Adaptive computationally efficient network for monocular 3d hand pose estimation. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part IV 16, pages 127\u2013144. Springer, (2020)","DOI":"10.1007\/978-3-030-58548-8_8"},{"key":"4799_CR21","doi-asserted-by":"crossref","unstructured":"Zhang, X., Huang, H., Tan, J., Xu, H., Yang, C., Peng, G., Wang, L., Liu, J.: Hand image understanding via deep multi-task learning. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 11281\u201311292, (2021)","DOI":"10.1109\/ICCV48922.2021.01109"},{"key":"4799_CR22","doi-asserted-by":"crossref","unstructured":"Mueller, F., Mehta, D., Sotnychenko, O., Sridhar, S., Casas, D., Theobalt, C.: Real-time hand tracking under occlusion from an egocentric rgb-d sensor. In Proceedings of the IEEE international conference on computer vision, pages 1154\u20131163, (2017)","DOI":"10.1109\/ICCV.2017.131"},{"key":"4799_CR23","doi-asserted-by":"crossref","unstructured":"Kazakos, E., Nikou, C., Kakadiaris, I.\u00a0A.: On the fusion of rgb and depth information for hand pose estimation. In 2018 25th IEEE International Conference on Image Processing (ICIP), pages 868\u2013872. IEEE, (2018)","DOI":"10.1109\/ICIP.2018.8451022"},{"key":"4799_CR24","unstructured":"Howard, A.\u00a0G.: Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861, (2017)"},{"key":"4799_CR25","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., Chen, L.-C.: Mobilenetv2: Inverted residuals and linear bottlenecks. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 4510\u20134520, (2018)","DOI":"10.1109\/CVPR.2018.00474"},{"key":"4799_CR26","doi-asserted-by":"crossref","unstructured":"Zhou, D., Hou, Q., Chen, Y., Feng, J., Yan, S.: Rethinking bottleneck structure for efficient mobile network design. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part III 16, pages 680\u2013697. Springer, (2020)","DOI":"10.1007\/978-3-030-58580-8_40"},{"key":"4799_CR27","unstructured":"Mehta, S., Rastegari, M.: Mobilevit: light-weight, general-purpose, and mobile-friendly vision transformer. arXiv preprint arXiv:2110.02178, (2021)"},{"key":"4799_CR28","unstructured":"Mehta, S., Rastegari, M.: Separable self-attention for mobile vision transformers. arXiv preprint arXiv:2206.02680, (2022)"},{"key":"4799_CR29","doi-asserted-by":"crossref","unstructured":"Zhang, J., Jiao, J., Chen, M., Qu, L., Xu, X., Yang, Q.: A hand pose tracking benchmark from stereo matching. In 2017 IEEE International Conference on Image Processing (ICIP), pages 982\u2013986, (2017)","DOI":"10.1109\/ICIP.2017.8296428"},{"key":"4799_CR30","doi-asserted-by":"crossref","unstructured":"Moon, G., Yu, S-I., Wen, H., Shiratori, T., Lee, KM.: Interhand2.6m: A dataset and baseline for 3d interacting hand pose estimation from a single RGB image. CoRR, abs\/2008.09309, (2020)","DOI":"10.1007\/978-3-030-58565-5_33"},{"issue":"1","key":"4799_CR31","first-page":"8432840","volume":"2020","author":"S Dai","year":"2020","unstructured":"Dai, S., Liu, W., Yang, W., Fan, L., Zhang, J.: Cascaded hierarchical cnn for rgb-based 3d hand pose estimation. Math. Probl. Eng. 2020(1), 8432840 (2020)","journal-title":"Math. Probl. Eng."},{"key":"4799_CR32","doi-asserted-by":"crossref","unstructured":"Khaleghi, L., Marshall, J., Etemad, A.: Learning sequential contexts using transformer for 3d hand pose estimation. In 2022 26th International Conference on Pattern Recognition (ICPR), pages 535\u2013541. IEEE, (2022)","DOI":"10.1109\/ICPR56361.2022.9955633"},{"key":"4799_CR33","doi-asserted-by":"crossref","unstructured":"Spurr, A., Song, J., Park, S., Hilliges, O.: Cross-modal deep variational hand pose estimation. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 89\u201398, (2018)","DOI":"10.1109\/CVPR.2018.00017"},{"key":"4799_CR34","doi-asserted-by":"crossref","unstructured":"Yang, L., Yao, A.: Disentangling latent hands for image synthesis and pose estimation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 9877\u20139886, (2019)","DOI":"10.1109\/CVPR.2019.01011"},{"key":"4799_CR35","doi-asserted-by":"crossref","unstructured":"Lin, F., Wilhelm, C.: et\u00a0al. Two-hand global 3d pose estimation using monocular rgb. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pages 2373\u20132381, (2021)","DOI":"10.1109\/WACV48630.2021.00242"},{"issue":"2","key":"4799_CR36","doi-asserted-by":"publisher","first-page":"618","DOI":"10.3390\/app10020618","volume":"10","author":"X Wang","year":"2020","unstructured":"Wang, X., Jiang, J., Guo, Y., Kang, L., Wei, Y., Li, D.: Cfam: estimating 3d hand poses from a single rgb image with attention. Appl. Sci. 10(2), 618 (2020)","journal-title":"Appl. Sci."},{"key":"4799_CR37","doi-asserted-by":"crossref","unstructured":"Chen, L., Lin, S-Y., Xie, Y., Lin, Y-Y., Fan, W., Xie, X.: Dggan: Depth-image guided generative adversarial networks for disentangling rgb and depth images in 3d hand pose estimation. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pages 411\u2013419, (2020)","DOI":"10.1109\/WACV45572.2020.9093380"},{"key":"4799_CR38","doi-asserted-by":"crossref","unstructured":"Cai, Y., Ge, L., Cai, J.: et\u00a0al. Weakly-supervised 3d hand pose estimation from monocular rgb images. In Proceedings of the European conference on computer vision (ECCV), pages 666\u2013682, (2018)","DOI":"10.1007\/978-3-030-01231-1_41"},{"key":"4799_CR39","doi-asserted-by":"crossref","unstructured":"Lin, Y., Lin, S., et al.: 3d hand pose estimation algorithm based on cascaded features and graph conyolution. Chin. J. Liq. Cryst. Disp 37, 736\u2013745 (2022)","DOI":"10.37188\/CJLCD.2021-0307"},{"key":"4799_CR40","doi-asserted-by":"crossref","unstructured":"Junaid, HHS., Daneshfar, F., Mohammad, M\u00a0A.: Automatic colorectal cancer detection using machine learning and deep learning based on feature selection in histopathological images. Biomedical Signal Processing and Control, 107:107866, (2025)","DOI":"10.1016\/j.bspc.2025.107866"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-04799-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-025-04799-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-04799-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T18:57:05Z","timestamp":1761418625000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-025-04799-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,22]]},"references-count":40,"journal-issue":{"issue":"14","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["4799"],"URL":"https:\/\/doi.org\/10.1007\/s11760-025-04799-w","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"type":"print","value":"1863-1703"},{"type":"electronic","value":"1863-1711"}],"subject":[],"published":{"date-parts":[[2025,9,22]]},"assertion":[{"value":"10 March 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 August 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 September 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 September 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"1191"}}