{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T15:44:05Z","timestamp":1781797445709,"version":"3.54.5"},"publisher-location":"Cham","reference-count":88,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726972","type":"print"},{"value":"9783031726989","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72698-9_25","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T04:45:57Z","timestamp":1729831557000},"page":"428-448","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":16,"title":["Benchmarks and\u00a0Challenges in\u00a0Pose Estimation for\u00a0Egocentric Hand Interactions with\u00a0Objects"],"prefix":"10.1007","author":[{"given":"Zicong","family":"Fan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Takehiko","family":"Ohkawa","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Linlin","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nie","family":"Lin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhishan","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shihao","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiajun","family":"Liang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhong","family":"Gao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xuanyang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xue","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fei","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zheng","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Feng","family":"Lu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Karim Abou","family":"Zeid","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bastian","family":"Leibe","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jeongwan","family":"On","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Seungryul","family":"Baek","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Aditya","family":"Prakash","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Saurabh","family":"Gupta","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kun","family":"He","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yoichi","family":"Sato","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Otmar","family":"Hilliges","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hyung Jin","family":"Chang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Angela","family":"Yao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"25_CR1","unstructured":"Abou\u00a0Zeid, K.: JointTransformer: Winner of the HANDS\u20192023 ARCTIC Challenge @ ICCV (2023). https:\/\/github.com\/kabouzeid\/JointTransformer"},{"key":"25_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"85","DOI":"10.1007\/978-3-030-58592-1_6","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Armagan","year":"2020","unstructured":"Armagan, A., et al.: Measuring generalisation to unseen viewpoints, articulations, shapes and\u00a0objects for 3D hand pose estimation under hand-object interaction. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12368, pp. 85\u2013101. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58592-1_6"},{"key":"25_CR3","doi-asserted-by":"crossref","unstructured":"Boukhayma, A., de\u00a0Bem, R., Torr, P.H.S.: 3D hand shape and pose from images in the wild. In: Computer Vision and Pattern Recognition (CVPR), pp. 10843\u201310852 (2019)","DOI":"10.1109\/CVPR.2019.01110"},{"key":"25_CR4","doi-asserted-by":"crossref","unstructured":"Cai, Y., Ge, L., Cai, J., Yuan, J.: Weakly-supervised 3D hand pose estimation from monocular RGB images. In: European Conference on Computer Vision (ECCV), pp. 678\u2013694 (2018)","DOI":"10.1007\/978-3-030-01231-1_41"},{"key":"25_CR5","doi-asserted-by":"crossref","unstructured":"Cao, Z., Radosavovic, I., Kanazawa, A., Malik, J.: Reconstructing hand-object interactions in the wild. In: International Conference on Computer Vision (ICCV), pp. 12417\u201312426 (2021)","DOI":"10.1109\/ICCV48922.2021.01219"},{"key":"25_CR6","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., et al.: DexYCB: a benchmark for capturing hand grasping of objects. In: Computer Vision and Pattern Recognition (CVPR), pp. 9044\u20139053 (2021)","DOI":"10.1109\/CVPR46437.2021.00893"},{"key":"25_CR7","unstructured":"Chatterjee, D., Sener, F., Ma, S., Yao, A.: Opening the vocabulary of egocentric actions. In: Conference on Neural Information Processing Systems (NeurIPS), vol. 36 (2024)"},{"key":"25_CR8","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: MobRecon: mobile-friendly hand mesh reconstruction from monocular image. In: Computer Vision and Pattern Recognition (CVPR), pp. 20512\u201320522 (2022)","DOI":"10.1109\/CVPR52688.2022.01989"},{"key":"25_CR9","doi-asserted-by":"crossref","unstructured":"Chen, X., Wang, B., Shum, H.Y.: Hand avatar: free-pose hand animation and rendering from monocular video. In: Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00839"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Z., Chen, S., Schmid, C., Laptev, I.: gSDF: geometry-driven signed distance functions for 3D hand-object reconstruction. In: Computer Vision and Pattern Recognition (CVPR), pp. 12890\u201312900 (2023)","DOI":"10.1109\/CVPR52729.2023.01239"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Christen, S., Kocabas, M., Aksan, E., Hwangbo, J., Song, J., Hilliges, O.: D-Grasp: physically plausible dynamic grasp synthesis for hand-object interactions. In: Computer Vision and Pattern Recognition (CVPR), pp. 20545\u201320554 (2022)","DOI":"10.1109\/CVPR52688.2022.01992"},{"key":"25_CR12","doi-asserted-by":"crossref","unstructured":"Corona, E., Pumarola, A., Aleny\u00e0, G., Moreno-Noguer, F., Rogez, G.: GanHand: predicting human grasp affordances in multi-object scenes. In: Computer Vision and Pattern Recognition (CVPR), pp. 5030\u20135040 (2020)","DOI":"10.1109\/CVPR42600.2020.00508"},{"key":"25_CR13","doi-asserted-by":"crossref","unstructured":"Duran, E., Kocabas, M., Choutas, V., Fan, Z., Black, M.J.: HMP: hand motion priors for pose and shape estimation from video. In: Winter Conference on Applications of Computer Vision (WACV) (2024)","DOI":"10.1109\/WACV57701.2024.00623"},{"issue":"1\u20132","key":"25_CR14","first-page":"52","volume":"108","author":"A Erol","year":"2007","unstructured":"Erol, A., Bebis, G., Nicolescu, M., Boyle, R.D., Twombly, X.: Vision-based hand pose estimation: a review. CVIU 108(1\u20132), 52\u201373 (2007)","journal-title":"CVIU"},{"key":"25_CR15","doi-asserted-by":"crossref","unstructured":"Fan, Z., et al.: HOLD: category-agnostic 3D reconstruction of interacting hands and objects from video. In: Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00054"},{"key":"25_CR16","doi-asserted-by":"crossref","unstructured":"Fan, Z., Spurr, A., Kocabas, M., Tang, S., Black, M.J., Hilliges, O.: Learning to disambiguate strongly interacting hands via probabilistic per-pixel part segmentation. In: International Conference on 3D Vision (3DV), pp. 1\u201310 (2021)","DOI":"10.1109\/3DV53792.2021.00011"},{"key":"25_CR17","doi-asserted-by":"crossref","unstructured":"Fan, Z., et al.: ARCTIC: a dataset for dexterous bimanual hand-object manipulation. In: Proceedings IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01244"},{"key":"25_CR18","doi-asserted-by":"crossref","unstructured":"Fu, Q., Liu, X., Xu, R., Niebles, J.C., Kitani, K.M.: Deformer: dynamic fusion transformer for robust hand pose estimation. In: International Conference on Computer Vision (ICCV), pp. 23600\u201323611 (2023)","DOI":"10.1109\/ICCV51070.2023.02157"},{"key":"25_CR19","doi-asserted-by":"crossref","unstructured":"Garcia-Hernando, G., Yuan, S., Baek, S., Kim, T.K.: First-person hand action benchmark with RGB-D videos and 3D hand pose annotations. In: Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00050"},{"key":"25_CR20","doi-asserted-by":"crossref","unstructured":"Ge, L., et al.: 3D hand shape and pose estimation from a single RGB image. In: Computer Vision and Pattern Recognition (CVPR), pp. 10833\u201310842 (2019)","DOI":"10.1109\/CVPR.2019.01109"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Grady, P., Tang, C., Twigg, C.D., Vo, M., Brahmbhatt, S., Kemp, C.C.: ContactOpt: optimizing contact to improve grasps. In: Computer Vision and Pattern Recognition (CVPR), pp. 1471\u20131481 (2021)","DOI":"10.1109\/CVPR46437.2021.00152"},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"Guo, Z., Zhou, W., Wang, M., Li, L., Li, H.: HandNeRF: neural radiance fields for animatable interacting hands. In: Computer Vision and Pattern Recognition (CVPR), pp. 21078\u201321087 (2023)","DOI":"10.1109\/CVPR52729.2023.02019"},{"key":"25_CR23","doi-asserted-by":"crossref","unstructured":"Hampali, S., Rad, M., Oberweger, M., Lepetit, V.: HOnnotate: a method for 3D annotation of hand and object poses. In: Computer Vision and Pattern Recognition (CVPR), pp. 3193\u20133203 (2020)","DOI":"10.1109\/CVPR42600.2020.00326"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Hampali, S., Sarkar, S.D., Rad, M., Lepetit, V.: Keypoint transformer: solving joint identification in challenging hands and object interactions for accurate 3D pose estimation. In: Computer Vision and Pattern Recognition (CVPR), pp. 11090\u201311100 (2022)","DOI":"10.1109\/CVPR52688.2022.01081"},{"key":"25_CR25","doi-asserted-by":"crossref","unstructured":"Han, S., et al..: UmeTrack: unified multi-view end-to-end hand tracking for VR. In: International Conference on Computer Graphics and Interactive Techniques (SIGGRAPH), pp. 50:1\u201350:9. ACM (2022)","DOI":"10.1145\/3550469.3555378"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Hasson, Y., Tekin, B., Bogo, F., Laptev, I., Pollefeys, M., Schmid, C.: Leveraging photometric consistency over time for sparsely supervised hand-object reconstruction. In: Computer Vision and Pattern Recognition (CVPR), pp. 568\u2013577 (2020)","DOI":"10.1109\/CVPR42600.2020.00065"},{"key":"25_CR27","doi-asserted-by":"crossref","unstructured":"Hasson, Y., Varol, G., Schmid, C., Laptev, I.: Towards unconstrained joint hand-object reconstruction from RGB videos. In: International Conference on 3D Vision (3DV), pp. 659\u2013668. IEEE (2021)","DOI":"10.1109\/3DV53792.2021.00075"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Hasson, Y., et al.: Learning joint reconstruction of hands and manipulated objects. In: Computer Vision and Pattern Recognition (CVPR), pp. 11807\u201311816 (2019)","DOI":"10.1109\/CVPR.2019.01208"},{"key":"25_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.B.: Masked autoencoders are scalable vision learners. In: Computer Vision and Pattern Recognition (CVPR), pp. 15979\u201315988 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"25_CR31","doi-asserted-by":"crossref","unstructured":"Huang, D., et al.: Reconstructing hand-held objects from monocular video. In: SIGGRAPH Asia 2022 Conference Papers, pp.\u00a01\u20139 (2022)","DOI":"10.1145\/3550469.3555401"},{"key":"25_CR32","doi-asserted-by":"crossref","unstructured":"Iqbal, U., Molchanov, P., Gall, T.B.J., Kautz, J.: Hand pose estimation via latent 2.5D heatmap regression. In: European Conference on Computer Vision (ECCV), pp. 118\u2013134 (2018)","DOI":"10.1007\/978-3-030-01252-6_8"},{"key":"25_CR33","doi-asserted-by":"crossref","unstructured":"Kwon, T., Tekin, B., St\u00fchmer, J., Bogo, F., Pollefeys, M.: H2O: two hands manipulating objects for first person interaction recognition. In: International Conference on Computer Vision (ICCV), pp. 10138\u201310148 (2021)","DOI":"10.1109\/ICCV48922.2021.00998"},{"key":"25_CR34","doi-asserted-by":"crossref","unstructured":"Lee, J., Sung, M., Choi, H., Kim, T.K.: Im2Hands: learning attentive implicit representation of interacting two-hand shapes. In: Computer Vision and Pattern Recognition (CVPR), pp. 21169\u201321178 (2023)","DOI":"10.1109\/CVPR52729.2023.02028"},{"key":"25_CR35","doi-asserted-by":"crossref","unstructured":"Li, L., et al.: RenderIH: a large-scale synthetic dataset for 3D interacting hand pose estimation. In: International Conference on Computer Vision (ICCV), pp. 20395\u201320405 (2023)","DOI":"10.1109\/ICCV51070.2023.01865"},{"key":"25_CR36","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: Interacting attention graph for single image two-hand reconstruction. In: Computer Vision and Pattern Recognition (CVPR), pp. 2761\u20132770 (2022)","DOI":"10.1109\/CVPR52688.2022.00278"},{"key":"25_CR37","doi-asserted-by":"crossref","unstructured":"Liu, R., Ohkawa, T., Zhang, M., Sato, Y.: Single-to-dual-view adaptation for egocentric 3D hand pose estimation. In: Computer Vision and Pattern Recognition (CVPR), pp. 677\u2013686 (2024)","DOI":"10.1109\/CVPR52733.2024.00071"},{"key":"25_CR38","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Hoorick, B.V., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: zero-shot one image to 3D object (2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"25_CR39","doi-asserted-by":"crossref","unstructured":"Liu, S., Jiang, H., Xu, J., Liu, S., Wang, X.: Semi-supervised 3D hand-object poses estimation with interactions in time. In: Computer Vision and Pattern Recognition (CVPR), pp. 14687\u201314697 (2021)","DOI":"10.1109\/CVPR46437.2021.01445"},{"key":"25_CR40","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: International Conference on Computer Vision (ICCV), pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"25_CR41","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: Computer Vision and Pattern Recognition (CVPR), pp. 11966\u201311976 (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"25_CR42","doi-asserted-by":"publisher","unstructured":"Meng, H. et al.: 3D interacting hand pose estimation by hand de-occlusion and removal. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13666. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_22","DOI":"10.1007\/978-3-031-20068-7_22"},{"key":"25_CR43","doi-asserted-by":"crossref","unstructured":"Moon, G.: Bringing inputs to shared domains for 3D interacting hands recovery in the wild. In: Computer Vision and Pattern Recognition (CVPR), pp. 17028\u201317037 (2023)","DOI":"10.1109\/CVPR52729.2023.01633"},{"key":"25_CR44","unstructured":"Moon, G., et\u00a0al.: A dataset of relighted 3D interacting hands. In: Conference on Neural Information Processing Systems (NeurIPS) 36 (2024)"},{"key":"25_CR45","doi-asserted-by":"publisher","unstructured":"Moon, G., Yu, S.-I., Wen, H., Shiratori, T., Lee, K.M.: InterHand2.6M: a dataset and baseline for 3D interacting hand pose estimation from a single RGB image. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 548\u2013564. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_33","DOI":"10.1007\/978-3-030-58565-5_33"},{"key":"25_CR46","doi-asserted-by":"crossref","unstructured":"Mueller, F., et al.: GANerated hands for real-time 3D hand tracking from monocular RGB. In: Computer Vision and Pattern Recognition (CVPR), pp. 49\u201359 (2018)","DOI":"10.1109\/CVPR.2018.00013"},{"key":"25_CR47","unstructured":"Ohkawa, T.: AssemblyHands toolkit (2023). https:\/\/github.com\/facebookresearch\/assemblyhands-toolkit"},{"key":"25_CR48","doi-asserted-by":"crossref","unstructured":"Ohkawa, T., Furuta, R., Sato, Y.: Efficient annotation and learning for 3D hand pose estimation: a survey. IJCV 131, 3193\u20133206 (2023)","DOI":"10.1007\/s11263-023-01856-0"},{"key":"25_CR49","doi-asserted-by":"crossref","unstructured":"Ohkawa, T., He, K., Sener, F., Hodan, T., Tran, L., Keskin, C.: AssemblyHands: towards egocentric activity understanding via 3D hand pose estimation. In: Computer Vision and Pattern Recognition (CVPR), pp. 12999\u201313008 (2023)","DOI":"10.1109\/CVPR52729.2023.01249"},{"key":"25_CR50","doi-asserted-by":"crossref","unstructured":"Ohkawa, T., Li, Y.J., Fu, Q., Furuta, R., Kitani, K.M., Sato, Y.: Domain adaptive hand keypoint and pixel localization in the wild. In: European Conference on Computer Vision (ECCV), pp. 68\u201487 (2022)","DOI":"10.1007\/978-3-031-20077-9_5"},{"key":"25_CR51","doi-asserted-by":"crossref","unstructured":"Ohkawa, T., Yagi, T., Hashimoto, A., Ushiku, Y., Sato, Y.: Foreground-aware stylization and consensus pseudo-labeling for domain adaptation of first-person hand segmentation. IEEE Access 9, 94644\u201394655 (2021)","DOI":"10.1109\/ACCESS.2021.3094052"},{"key":"25_CR52","unstructured":"Oquab, M., et al.: DINOv2: learning robust visual features without supervision (2023)"},{"key":"25_CR53","doi-asserted-by":"crossref","unstructured":"Park, J., Oh, Y., Moon, G., Choi, H., Lee, K.M.: HandOccNet: occlusion-robust 3D hand mesh estimation network. In: Computer Vision and Pattern Recognition (CVPR), pp. 1496\u20131505 (2022)","DOI":"10.1109\/CVPR52688.2022.00155"},{"key":"25_CR54","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: DreamFusion: text-to-3D using 2D diffusion. arXiv preprint arXiv:2209.14988 (2022)"},{"key":"25_CR55","doi-asserted-by":"crossref","unstructured":"Prakash, A., Tu, R., Chang, M., Gupta, S.: 3D hand pose estimation in everyday egocentric images. In: European Conference on Computer Vision (ECCV) (2024)","DOI":"10.1007\/978-3-031-73229-4_11"},{"key":"25_CR56","doi-asserted-by":"crossref","unstructured":"Radosavovic, I., Kosaraju, R.P., Girshick, R.B., He, K., Doll\u00e1r, P.: Designing network design spaces. In: Computer Vision and Pattern Recognition (CVPR), pp. 10425\u201310433 (2020)","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"25_CR57","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/BFb0028333","volume-title":"Computer Vision \u2014 ECCV \u201994","author":"JM Rehg","year":"1994","unstructured":"Rehg, J.M., Kanade, T.: Visual tracking of high DOF articulated structures: an application to human hand tracking. In: Eklundh, J.O., et al. (eds.) ECCV \u201994, pp. 35\u201346. Springer, Berlin, Heidelberg (1994). https:\/\/doi.org\/10.1007\/BFb0028333"},{"key":"25_CR58","doi-asserted-by":"crossref","unstructured":"Remelli, E., Han, S., Honari, S., Fua, P., Wang, R.: Lightweight multi-view 3D pose estimation through camera-disentangled representation. In: Computer Vision and Pattern Recognition (CVPR), pp. 6039\u20136048 (2020)","DOI":"10.1109\/CVPR42600.2020.00608"},{"key":"25_CR59","doi-asserted-by":"crossref","unstructured":"Romero, J., Tzionas, D., Black, M.J.: Embodied hands: modeling and capturing hands and bodies together. ACM TOG 36(6), 245:1\u2013245:17 (2017)","DOI":"10.1145\/3130800.3130883"},{"key":"25_CR60","unstructured":"Ryali, C., et al.: Hiera: a hierarchical vision transformer without the bells-and-whistles. In: International Conference on Machine Learning (ICML), vol.\u00a0202, pp. 29441\u201329454 (2023)"},{"key":"25_CR61","doi-asserted-by":"crossref","unstructured":"Sener, F., et al.: Assembly101: a large-scale multi-view video dataset for understanding procedural activities. In: Computer Vision and Pattern Recognition (CVPR), pp. 21064\u201321074 (2022)","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"25_CR62","doi-asserted-by":"crossref","unstructured":"Shamil, M.S., Chatterjee, D., Sener, F., Ma, S., Yao, A.: On the utility of 3D hand poses for action recognition. In: European Conference on Computer Vision (ECCV) (2024)","DOI":"10.1007\/978-3-031-72658-3_25"},{"key":"25_CR63","doi-asserted-by":"crossref","unstructured":"Simon, T., Joo, H., Matthews, I., Sheikh, Y.: Hand keypoint detection in single images using multiview bootstrapping. In: Computer Vision and Pattern Recognition (CVPR), pp. 4645\u20134653 (2017)","DOI":"10.1109\/CVPR.2017.494"},{"key":"25_CR64","doi-asserted-by":"crossref","unstructured":"Spurr, A., Dahiya, A., Wang, X., Zhang, X., Hilliges, O.: Self-supervised 3D hand pose estimation from monocular RGB via contrastive learning. In: International Conference on Computer Vision (ICCV), pp. 11210\u201311219 (2021)","DOI":"10.1109\/ICCV48922.2021.01104"},{"key":"25_CR65","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/978-3-030-58520-4_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Spurr","year":"2020","unstructured":"Spurr, A., Iqbal, U., Molchanov, P., Hilliges, O., Kautz, J.: Weakly supervised 3D hand pose estimation via biomechanical constraints. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12362, pp. 211\u2013228. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_13"},{"key":"25_CR66","doi-asserted-by":"crossref","unstructured":"Spurr, A., Song, J., Park, S., Hilliges, O.: Cross-modal deep variational hand pose estimation. In: Computer Vision and Pattern Recognition (CVPR), pp. 89\u201398 (2018)","DOI":"10.1109\/CVPR.2018.00017"},{"key":"25_CR67","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., Wang, J.: Deep high-resolution representation learning for human pose estimation. In: Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00584"},{"key":"25_CR68","doi-asserted-by":"crossref","unstructured":"Swamy, A., et al.: SHOWMe: benchmarking object-agnostic hand-object 3D reconstruction. In: International Conference on Computer Vision (ICCV), pp. 1935\u20131944 (2023)","DOI":"10.1016\/j.cviu.2024.104073"},{"key":"25_CR69","doi-asserted-by":"crossref","unstructured":"Tekin, B., Bogo, F., Pollefeys, M.: H+O: unified egocentric recognition of 3D hand-object poses and interactions. In: Computer Vision and Pattern Recognition (CVPR), pp. 4511\u20134520 (2019)","DOI":"10.1109\/CVPR.2019.00464"},{"key":"25_CR70","doi-asserted-by":"crossref","unstructured":"Tse, T.H.E., Kim, K.I., Leonardis, A., Chang, H.J.: Collaborative learning for hand and object reconstruction with attention-guided graph convolution. In: Computer Vision and Pattern Recognition (CVPR), pp. 1664\u20131674 (2022)","DOI":"10.1109\/CVPR52688.2022.00171"},{"key":"25_CR71","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1007\/978-3-642-40602-7_14","volume-title":"Pattern Recognition","author":"D Tzionas","year":"2013","unstructured":"Tzionas, D., Gall, J.: A comparison of directional distances for hand pose estimation. In: Weickert, J., Hein, M., Schiele, B. (eds.) GCPR 2013. LNCS, vol. 8142, pp. 131\u2013141. Springer, Heidelberg (2013). https:\/\/doi.org\/10.1007\/978-3-642-40602-7_14"},{"key":"25_CR72","unstructured":"Wen, Y., et al.: Generative hierarchical temporal transformer for hand action recognition and motion prediction. arXiv preprint arXiv:2311.17366 (2023)"},{"key":"25_CR73","doi-asserted-by":"crossref","unstructured":"Yang, L., Chen, S., Yao, A.: SemiHand: semi-supervised hand pose estimation with consistency. In: International Conference on Computer Vision (ICCV), pp. 11364\u201311373 (2021)","DOI":"10.1109\/ICCV48922.2021.01117"},{"key":"25_CR74","doi-asserted-by":"crossref","unstructured":"Yang, L., Zhan, X., Li, K., Xu, W., Li, J., Lu, C.: CPF: learning a contact potential field to model the hand-object interaction. In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.01091"},{"key":"25_CR75","doi-asserted-by":"crossref","unstructured":"Ye, Y., Gupta, A., Tulsiani, S.: What\u2019s in your hands? 3D reconstruction of generic objects in hands. In: Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00387"},{"key":"25_CR76","doi-asserted-by":"crossref","unstructured":"Ye, Y., Hebbar, P., Gupta, A., Tulsiani, S.: Diffusion-guided reconstruction of everyday hand-object interaction clips. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01806"},{"key":"25_CR77","doi-asserted-by":"crossref","unstructured":"Yuan, S., et al.: Depth-based 3D hand pose estimation: from current achievements to future goals. In: Computer Vision and Pattern Recognition (CVPR), pp. 2636\u20132645 (2018)","DOI":"10.1109\/CVPR.2018.00279"},{"key":"25_CR78","doi-asserted-by":"crossref","unstructured":"Zhai, X., Kolesnikov, A., Houlsby, N., Beyer, L.: Scaling vision transformers. In: Computer Vision and Pattern Recognition (CVPR), pp. 12104\u201312113 (2022)","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"25_CR79","doi-asserted-by":"crossref","unstructured":"Zhang, H., Christen, S., Fan, Z., Hilliges, O., Song, J.: GraspXL: generating grasping motions for diverse objects at scale. In: European Conference on Computer Vision (ECCV) (2024)","DOI":"10.1007\/978-3-031-73347-5_22"},{"key":"25_CR80","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: ArtiGrasp: physically plausible synthesis of bi-manual dexterous grasping and articulation. In: International Conference on 3D Vision (3DV) (2024)","DOI":"10.1109\/3DV62453.2024.00016"},{"key":"25_CR81","doi-asserted-by":"crossref","unstructured":"Zhang, X., Li, Q., Mo, H., Zhang, W., Zheng, W.: End-to-end hand mesh recovery from a monocular RGB image. In: International Conference on Computer Vision (ICCV), pp. 2354\u20132364 (2019)","DOI":"10.1109\/ICCV.2019.00244"},{"key":"25_CR82","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Habermann, M., Xu, W., Habibie, I., Theobalt, C., Xu, F.: Monocular real-time hand shape and motion capture using multi-modal data. In: Computer Vision and Pattern Recognition (CVPR), pp. 5345\u20135354 (2020)","DOI":"10.1109\/CVPR42600.2020.00539"},{"key":"25_CR83","unstructured":"Zhou, Z.: SimpleHand: Winner of the HANDS\u20192023 AssemblyHands Challenge @ ICCV (2024). https:\/\/github.com\/patienceFromZhou\/simpleHand"},{"key":"25_CR84","unstructured":"Zhou, Z., et al.: 1st place solution of egocentric 3D hand pose estimation challenge 2023 technical report: a concise pipeline for egocentric hand pose reconstruction. arXiv preprint arXiv:2310.04769 (2023)"},{"key":"25_CR85","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Zhou, S., Lv, Z., Zou, M., Tang, Y., Liang, J.: A simple baseline for efficient hand mesh reconstruction. In: Computer Vision and Pattern Recognition (CVPR), pp. 1367\u20131376 (2024)","DOI":"10.1109\/CVPR52733.2024.00136"},{"key":"25_CR86","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: International Conference on Learning Representations (ICLR) (2021). https:\/\/openreview.net\/forum?id=gZ9hCDWe6ke"},{"key":"25_CR87","doi-asserted-by":"crossref","unstructured":"Ziani, A., Fan, Z., Kocabas, M., Christen, S., Hilliges, O.: TempCLR: reconstructing hands via time-coherent contrastive learning. In: International Conference on 3D Vision (3DV), pp. 627\u2013636 (2022)","DOI":"10.1109\/3DV57658.2022.00073"},{"key":"25_CR88","doi-asserted-by":"crossref","unstructured":"Zimmermann, C., Brox, T.: Learning to estimate 3D hand pose from single RGB images. In: International Conference on Computer Vision (ICCV), pp. 4913\u20134921 (2017)","DOI":"10.1109\/ICCV.2017.525"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72698-9_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T07:21:33Z","timestamp":1732951293000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72698-9_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031726972","9783031726989"],"references-count":88,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72698-9_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}