{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,24]],"date-time":"2025-06-24T08:45:56Z","timestamp":1750754756052,"version":"3.37.3"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"29","license":[{"start":{"date-parts":[[2021,6,15]],"date-time":"2021-06-15T00:00:00Z","timestamp":1623715200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,6,15]],"date-time":"2021-06-15T00:00:00Z","timestamp":1623715200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001473","name":"Tote board - Singapore","doi-asserted-by":"publisher","award":["GC62018NUSISS"],"award-info":[{"award-number":["GC62018NUSISS"]}],"id":[{"id":"10.13039\/501100001473","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2022,12]]},"DOI":"10.1007\/s11042-021-11020-w","type":"journal-article","created":{"date-parts":[[2021,6,15]],"date-time":"2021-06-15T10:03:18Z","timestamp":1623751398000},"page":"41661-41676","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["An enhanced self-attention and A2J approach for 3D hand pose estimation"],"prefix":"10.1007","volume":"81","author":[{"given":"Mei-Ying","family":"Ng","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0566-6069","authenticated-orcid":false,"given":"Chin-Boon","family":"Chng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wai-Kin","family":"Koh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chee-Kong","family":"Chui","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Matthew Chin-Heng","family":"Chua","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,6,15]]},"reference":[{"key":"11020_CR1","unstructured":"Bahdanau D, Cho K, Bengio Y (2014) Neural machine translation by jointly learning to align and translate. arXiv:1409.0473"},{"key":"11020_CR2","doi-asserted-by":"crossref","unstructured":"Bello I, Zoph B, Vaswani A, Shlens J, Le QV (2019) Attention augmented convolutional networks. In: Proceedings of the IEEE international conference on computer vision. pp 3286\u20133295","DOI":"10.1109\/ICCV.2019.00338"},{"key":"11020_CR3","unstructured":"Bouchacourt D, Mudigonda PK, Nowozin S (2016) Disco nets: Dissimilarity coefficients networks. In: Advances in neural information processing systems. pp 352\u2013360"},{"key":"11020_CR4","unstructured":"Cejnog LWX, Cesar RM, de Campos TE, Elui VMC (2019) Hand range of motion evaluation for rheumatoid arthritis patients. In: 2019 14th IEEE international conference on automatic face & gesture recognition (FG 2019). IEEE, pp 1\u20135"},{"key":"11020_CR5","doi-asserted-by":"publisher","first-page":"138","DOI":"10.1016\/j.neucom.2018.06.097","volume":"395","author":"X Chen","year":"2020","unstructured":"Chen X, Wang G, Guo H, Zhang C (2020) Pose guided structured region ensemble network for cascaded hand pose estimation. Neurocomputing 395:138\u2013149","journal-title":"Neurocomputing"},{"key":"11020_CR6","doi-asserted-by":"publisher","first-page":"43425","DOI":"10.1109\/ACCESS.2018.2863540","volume":"6","author":"X Chen","year":"2018","unstructured":"Chen X, Wang G, Zhang C, Kim Tae-Kyun, Ji X (2018) Shpr-net: Deep semantic hand pose regression from point clouds. IEEE Access 6:43425\u201343439","journal-title":"IEEE Access"},{"key":"11020_CR7","unstructured":"Deng X, Yang S, Zhang Y, Tan P, Chang L, Wang H (2017) Hand3d: Hand pose estimation using 3d neural network. arXiv:1704.02224"},{"key":"11020_CR8","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1016\/j.neucom.2017.04.014","volume":"251","author":"D Fourure","year":"2017","unstructured":"Fourure D, Emonet R\u00e9mi, Fromont E, Muselet D, Neverova N, Tr\u00e9meau A., Wolf C (2017) Multi-task, multi-domain learning: application to semantic segmentation and pose regression. Neurocomputing 251:68\u201380","journal-title":"Neurocomputing"},{"key":"11020_CR9","doi-asserted-by":"crossref","unstructured":"Garcia-Hernando G, Yuan S, Baek S, Kim T-K (2018) First-person hand action benchmark with rgb-d videos and 3d hand pose annotations. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 409\u2013419","DOI":"10.1109\/CVPR.2018.00050"},{"key":"11020_CR10","doi-asserted-by":"crossref","unstructured":"Ge L, Cai Y, Weng J, Yuan J (2018) Hand pointnet: 3d hand pose estimation using point sets. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 8417\u20138426","DOI":"10.1109\/CVPR.2018.00878"},{"key":"11020_CR11","doi-asserted-by":"crossref","unstructured":"Ge L, Liang H, Yuan J, Thalmann D (2016) Robust 3d hand pose estimation in single depth images: from single-view cnn to multi-view cnns. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 3593\u20133601","DOI":"10.1109\/CVPR.2016.391"},{"key":"11020_CR12","doi-asserted-by":"crossref","unstructured":"Ge L, Ren Z, Yuan J (2018) Point-to-point regression pointnet for 3d hand pose estimation. In: Proceedings of the European conference on computer vision (ECCV). pp 475\u2013491","DOI":"10.1109\/CVPR.2018.00878"},{"key":"11020_CR13","doi-asserted-by":"crossref","unstructured":"Girdhar R, Carreira J, Doersch C, Zisserman A (2019) Video action transformer network. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 244\u2013253","DOI":"10.1109\/CVPR.2019.00033"},{"key":"11020_CR14","doi-asserted-by":"publisher","first-page":"18258","DOI":"10.1109\/ACCESS.2020.2968361","volume":"8","author":"F Guo","year":"2020","unstructured":"Guo F, He Z, Zhang S, Zhao X, Tan J (2020) Attention-based pose sequence machine for 3d hand pose estimation. IEEE Access 8:18258\u201318269","journal-title":"IEEE Access"},{"key":"11020_CR15","doi-asserted-by":"crossref","unstructured":"Guo H, Wang G, Chen X, Zhang C (2017) Towards good practices for deep 3d hand pose estimation. arXiv:1707.07248","DOI":"10.1016\/j.jvcir.2018.04.005"},{"key":"11020_CR16","doi-asserted-by":"crossref","unstructured":"He K, Gkioxari G, Doll\u00e1r P, Girshick R (2017) Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp 2961\u20132969","DOI":"10.1109\/ICCV.2017.322"},{"key":"11020_CR17","unstructured":"Huang L, Yuan Y, Guo J, Zhang C, Chen X, Wang J (2019) Interlaced sparse self-attention for semantic segmentation. arXiv:1907.12273"},{"key":"11020_CR18","doi-asserted-by":"crossref","unstructured":"Imura S, Hosobe H (2018) A hand gesture-based method for biometric authentication. In: International conference on human-computer interaction. Springer, pp 554\u2013566","DOI":"10.1007\/978-3-319-91238-7_43"},{"key":"11020_CR19","first-page":"1097","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. Adv Neural Inf Process Syst 25:1097\u20131105","journal-title":"Adv Neural Inf Process Syst"},{"key":"11020_CR20","doi-asserted-by":"crossref","unstructured":"Kuo D u, Lin X, Yi S, Ma X (2019) Crossinfonet: Multi-task information sharing based hand pose estimation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 9896\u20139905","DOI":"10.1109\/CVPR.2019.01013"},{"issue":"4","key":"11020_CR21","doi-asserted-by":"publisher","first-page":"541","DOI":"10.1162\/neco.1989.1.4.541","volume":"1","author":"Y LeCun","year":"1989","unstructured":"LeCun Y, Boser B, Denker JS, Henderson D, Howard RE, Hubbard W, Jackel LD (1989) Backpropagation applied to handwritten zip code recognition. Neural Comput 1(4):541\u2013551","journal-title":"Neural Comput"},{"key":"11020_CR22","doi-asserted-by":"crossref","unstructured":"Li W-J, Hsieh C-Y, Lin L-F, Chu W-C (2017) Hand gesture recognition for post-stroke rehabilitation using leap motion. In: 2017 international conference on applied system innovation (ICASI). IEEE, pp 386\u2013388","DOI":"10.1109\/ICASI.2017.7988433"},{"key":"11020_CR23","doi-asserted-by":"crossref","unstructured":"Luong M-T, Pham H, Manning CD (2015) Effective approaches to attention-based neural machine translation. arXiv:1508.04025","DOI":"10.18653\/v1\/D15-1166"},{"key":"11020_CR24","unstructured":"Madadi M, Escalera S, Bar\u00f3 X, Gonzalez J (2017) End-to-end global to local cnn learning for hand pose recovery in depth data. arXiv:1705.09606"},{"key":"11020_CR25","doi-asserted-by":"crossref","unstructured":"Madadi M, Escalera S, Carruesco A, Andujar C, Bar\u00f3 X, Gonzalez J (2017) Occlusion aware hand pose recovery from sequences of depth images. In: 2017 12th IEEE international conference on automatic face & gesture recognition (FG 2017). IEEE, pp 230\u2013237","DOI":"10.1109\/FG.2017.37"},{"key":"11020_CR26","unstructured":"Moon G, Ju YC, Lee KM (2018) V2v-posenet: Voxel-to-voxel prediction network for accurate 3d hand and human pose estimation from a single depth map. In: Proceedings of the IEEE conference on computer vision and pattern Recognition. pp 5079\u20135088"},{"key":"11020_CR27","doi-asserted-by":"crossref","unstructured":"Oberweger M, Lepetit V (2017) Deepprior+ +: Improving fast and accurate 3d hand pose estimation. In: Proceedings of the IEEE international conference on computer vision workshops. pp 585\u2013594","DOI":"10.1109\/ICCVW.2017.75"},{"key":"11020_CR28","unstructured":"Oberweger M, Wohlhart P, Lepetit V (2015) Hands deep in deep learning for hand pose estimation. arXiv:1502.06807"},{"key":"11020_CR29","unstructured":"Parmar N, Vaswani A, Uszkoreit J, Kaiser \u0141ukasz, Shazeer N, Alexander K u, Tran D (2018) Image transformer. arXiv:1802.05751"},{"key":"11020_CR30","doi-asserted-by":"crossref","unstructured":"Poier G, Opitz M, Schinagl D, Bischof H (2019) Murauer: Mapping unlabeled real data for label austerity. In: 2019 IEEE winter conference on applications of computer vision (WACV). IEEE, pp 1393\u20131402","DOI":"10.1109\/WACV.2019.00153"},{"key":"11020_CR31","unstructured":"Ramachandran P, Parmar N, Vaswani A, Bello I, Levskaya A, Shlens J (2019) Stand-alone self-attention in vision models. arXiv:1906.05909"},{"key":"11020_CR32","unstructured":"Ren P, Sun H, Qi Q i, Wang J, Huang W (2019) Srn: Stacked regression network for real-time 3d hand pose estimation. In: BMVC, page 112"},{"key":"11020_CR33","doi-asserted-by":"crossref","unstructured":"Showers A, Si M (2018) Pointing estimation for human-robot interaction using hand pose, verbal cues, and confidence heuristics. In: International conference on social computing and social media. Springer, pp 403\u2013412","DOI":"10.1007\/978-3-319-91485-5_31"},{"key":"11020_CR34","doi-asserted-by":"crossref","unstructured":"Sun X, Wei Y, Liang S, Tang X, Sun J (2015) Cascaded hand pose regression. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 824\u2013832","DOI":"10.1109\/CVPR.2015.7298683"},{"key":"11020_CR35","doi-asserted-by":"crossref","unstructured":"Tang D, Chang HJ, Tejani A, Kim T-K (2014) Latent regression forest: Structured estimation of 3d articulated hand posture. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 3786\u20133793","DOI":"10.1109\/CVPR.2014.490"},{"key":"11020_CR36","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1016\/j.neucom.2020.07.078","volume":"417","author":"Y Tian","year":"2020","unstructured":"Tian Y, Zhang Y, Di Z, Cheng G, Chen W-G, Wang R (2020) Triple attention network for video segmentation. Neurocomputing 417:202\u2013211","journal-title":"Neurocomputing"},{"issue":"5","key":"11020_CR37","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2629500","volume":"33","author":"J Tompson","year":"2014","unstructured":"Tompson J, Stein M, Lecun Y, Perlin K (2014) Real-time continuous pose recovery of human hands using convolutional networks. ACM Trans Graph (ToG) 33(5):1\u201310","journal-title":"ACM Trans Graph (ToG)"},{"key":"11020_CR38","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Advances in neural information processing systems. pp 5998\u20136008"},{"key":"11020_CR39","doi-asserted-by":"crossref","unstructured":"Wan C, Probst T, Gool LV, Yao A (2017) Crossing nets: Combining gans and vaes with a shared latent space for hand pose estimation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 680\u2013689","DOI":"10.1109\/CVPR.2017.132"},{"key":"11020_CR40","doi-asserted-by":"crossref","unstructured":"Wan C, Probst T, Gool LV, Yao A (2018) Dense 3d regression for hand pose estimation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 5147\u20135156","DOI":"10.1109\/CVPR.2018.00540"},{"issue":"2","key":"11020_CR41","doi-asserted-by":"publisher","first-page":"618","DOI":"10.3390\/app10020618","volume":"10","author":"X Wang","year":"2020","unstructured":"Wang X, Jiang J, Guo Y, Kang L, Wei Y, Li D (2020) Cfam: Estimating 3d hand poses from a single rgb image with attention. Appl Sci 10(2):618","journal-title":"Appl Sci"},{"key":"11020_CR42","doi-asserted-by":"crossref","unstructured":"Xiong F, Zhang B, Xiao Y, Cao Z, Yu T, Zhou JT, Yuan J (2019) A2j: Anchor-to-joint regression network for 3d articulated pose estimation from a single depth image. In: Proceedings of the IEEE international conference on computer vision. pp 793\u2013802","DOI":"10.1109\/ICCV.2019.00088"},{"issue":"3","key":"11020_CR43","doi-asserted-by":"publisher","first-page":"454","DOI":"10.1007\/s11263-017-0998-6","volume":"123","author":"C Xu","year":"2017","unstructured":"Xu C, Govindarajan LN, Yu Z, Li C (2017) Lie-x: Depth image based articulated object pose estimation, tracking, and action recognition on lie groups. Int J Comput Vis 123(3):454\u2013478","journal-title":"Int J Comput Vis"},{"key":"11020_CR44","doi-asserted-by":"crossref","unstructured":"Yuan S, Garcia-Hernando G, Stenger B, Moon G, Ju YC, Kyoung ML, Molchanov P, Kautz J, Honari S, Ge L et al (2018) Depth-based 3d hand pose estimation: From current achievements to future goals. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp 2636\u20132645","DOI":"10.1109\/CVPR.2018.00279"},{"key":"11020_CR45","unstructured":"Yuan S, Qi Y, Garcia-Hernando G, Kim T-K (2017) The 2017 hands in the million challenge on 3d hand pose estimation. arXiv:1707.02237"},{"key":"11020_CR46","doi-asserted-by":"crossref","unstructured":"Yuan S, Ye Q, Stenger B, Jain S, Kim T-K (2017) Bighand2. 2m benchmark: Hand pose dataset and state of the art analysis. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 4866\u20134874","DOI":"10.1109\/CVPR.2017.279"},{"key":"11020_CR47","unstructured":"Zhang H, Goodfellow I, Metaxas D, Odena A (2019) Self-attention generative adversarial networks. In: International conference on machine learning. PMLR, pp 7354\u20137363"},{"key":"11020_CR48","doi-asserted-by":"crossref","unstructured":"Zhang Y, Meruvia-Pastor O (2017) Operating virtual panels with hand gestures in immersive vr games. In: International conference on augmented reality, virtual reality and computer graphics. Springer, pp 299\u2013308","DOI":"10.1007\/978-3-319-60922-5_24"},{"key":"11020_CR49","unstructured":"Zhou X, Wan Q, Zhang W, Xue X, Wei Y (2016) Model-based deep hand pose estimation. arXiv:1606.06854"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-11020-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-021-11020-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-11020-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,26]],"date-time":"2022-11-26T22:40:24Z","timestamp":1669502424000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-021-11020-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,15]]},"references-count":49,"journal-issue":{"issue":"29","published-print":{"date-parts":[[2022,12]]}},"alternative-id":["11020"],"URL":"https:\/\/doi.org\/10.1007\/s11042-021-11020-w","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2021,6,15]]},"assertion":[{"value":"5 October 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 February 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 May 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 June 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"<!--Emphasis Type='Bold' removed-->Conflict of Interests"}}]}}