{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:16:13Z","timestamp":1759331773480,"version":"3.40.3"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031730269"},{"type":"electronic","value":"9783031730276"}],"license":[{"start":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T00:00:00Z","timestamp":1732579200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T00:00:00Z","timestamp":1732579200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73027-6_19","type":"book-chapter","created":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T17:54:29Z","timestamp":1732557269000},"page":"329-345","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["AttentionHand: Text-Driven Controllable Hand Image Generation for\u00a03D Hand Reconstruction in\u00a0the\u00a0Wild"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-3474-0010","authenticated-orcid":false,"given":"Junho","family":"Park","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1135-7502","authenticated-orcid":false,"given":"Kyeongbo","family":"Kong","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4809-956X","authenticated-orcid":false,"given":"Suk-Ju","family":"Kang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,26]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Hampali, S., Rad, M., Oberweger, M., Lepetit, V.: HOnnotate: a method for 3D annotation of hand and object poses. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 3196\u20133206 (2020)","DOI":"10.1109\/CVPR42600.2020.00326"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Chao, Y.W.: DexYCB: a benchmark for capturing hand grasping of objects. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 9044\u20139053 (2021)","DOI":"10.1109\/CVPR46437.2021.00893"},{"key":"19_CR3","doi-asserted-by":"crossref","unstructured":"Ohkawa, T., He, K., Sener, F., Hodan, T., Tran, L., Keskin, C.: AssemblyHands: towards egocentric activity understanding via 3D hand pose estimation. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 12999\u201313008 (2023)","DOI":"10.1109\/CVPR52729.2023.01249"},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Lin, F., Wilhelm, C., Martinez, T.: Two-hand global 3D pose estimation using monocular RGB. In: IEEE, pp. 2373\u20132381 (2021)","DOI":"10.1109\/WACV48630.2021.00242"},{"key":"19_CR5","unstructured":"Moon, G., et al.: A dataset of relighted 3D interacting hands. Adv. Neural Inf. Process. Syst. 36 (2023)"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: ECCV, pp. 740\u2013755 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Moon, G., Yu, S.I., Wen, H., Shiratori, T., Lee, K.M.: InterHand2.6M: a dataset and baseline for 3D interacting hand pose estimation from a single RGB image. In: ECCV, pp. 548\u2013564 (2020)","DOI":"10.1007\/978-3-030-58565-5_33"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Rong, Y., Wang, J., Liu, Z., Loy, C.C.: Monocular 3D reconstruction of interacting hands via collision-aware factorized refinements. In: 3DV, pp. 432\u2013441 (2021)","DOI":"10.1109\/3DV53792.2021.00053"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Zhang, B., et al.: Interacting two-hand 3D pose and shape reconstruction from single color image. In: ICCV, pp. 11354\u201311363 (2021)","DOI":"10.1109\/ICCV48922.2021.01116"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: Interacting attention graph for single image two-hand reconstruction. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 2761\u20132770 (2022)","DOI":"10.1109\/CVPR52688.2022.00278"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Hampali, S., Sarkar, S.D., Rad, M., Lepetit, V.: Keypoint Transformer: solving joint identification in challenging hands and object interactions for accurate 3D pose estimation. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 11090\u201311100 (2022)","DOI":"10.1109\/CVPR52688.2022.01081"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Meng, H., et al.: 3D interacting hand pose estimation by hand de-occlusion and removal. In: ECCV, pp. 380\u2013397 (2022)","DOI":"10.1007\/978-3-031-20068-7_22"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Ren, P., et al.: Decoupled iterative refinement framework for interacting hands reconstruction from a single RGB image. In: ICCV, pp. 8014\u20138025 (2023)","DOI":"10.1109\/ICCV51070.2023.00736"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Zuo, B., Zhao, Z., Sun, W., Xie, W., Xue, Z., Wang, Y.: Reconstructing interacting hands with interaction prior from monocular images. In: ICCV, pp. 9054\u20139064 (2023)","DOI":"10.1109\/ICCV51070.2023.00831"},{"key":"19_CR15","doi-asserted-by":"crossref","unstructured":"Li, L., et al.: RenderIH: a large-scale synthetic dataset for 3D interacting hand pose estimation. In: ICCV, pp. 20395\u201320405 (2023)","DOI":"10.1109\/ICCV51070.2023.01865"},{"key":"19_CR16","doi-asserted-by":"crossref","unstructured":"Moon, G.: Bringing inputs to shared domains for 3D interacting hands recovery in the wild. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 17028\u201317037 (2023)","DOI":"10.1109\/CVPR52729.2023.01633"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: ICCV, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2I-Adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2024)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"19_CR20","unstructured":"Zhao, S., et al.: Uni-ControlNet: all-in-one control to text-to-image diffusion models. Adv. Neural Inf. Process. Syst. 36 (2023)"},{"key":"19_CR21","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Mueller, F., et al.: GANerated hands for real-time 3D hand tracking from monocular RGB. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 49\u201359 (2018)","DOI":"10.1109\/CVPR.2018.00013"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Tang, H., Wang, W., Xu, D., Yan, Y., Sebe, N.: GestureGAN for hand gesture-to-gesture translation in the wild. In: ACM International Multimedia Conference, pp. 774\u2013782 (2018)","DOI":"10.1145\/3240508.3240704"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Hu, H., Wang, W., Zhou, W., Zhao, W., Li, H.: Model-aware gesture-to-gesture translation. In: Conference on Computer Vision and Pattern Recognition, pp. 16428\u201316437 (2021)","DOI":"10.1109\/CVPR46437.2021.01616"},{"key":"19_CR25","first-page":"23805","volume":"35","author":"H Hu","year":"2022","unstructured":"Hu, H., Wang, W., Zhou, W., Li, H.: Hand-object interaction image generation. Adv. Neural Inf. Process. Syst. 35, 23805\u201323817 (2022)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"19_CR26","first-page":"2672","volume":"27","author":"I Goodfellow","year":"2014","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. Adv. Neural Inform. Process. Syst. 27, 2672\u20132680 (2014)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"19_CR27","unstructured":"Li, L., Zhuo, L.A., Zhang, B., Bo, L., Chen, C.: DiffHand: end-to-end hand mesh reconstruction via diffusion models. arXiv preprint arXiv:2305.13705 (2023)"},{"key":"19_CR28","unstructured":"Lin, P., et al.: HandDiffuse: generative controllers for two-hand interactions via diffusion models. arXiv preprint arXiv:2312.04867 (2023)"},{"key":"19_CR29","doi-asserted-by":"crossref","unstructured":"Lu, W., Xu, Y., Zhang, J., Wang, C., Tao, D.: HandRefiner: refining malformed hands in generated images by diffusion-based conditional inpainting. arXiv preprint arXiv:2311.17957 (2023)","DOI":"10.1145\/3664647.3680693"},{"key":"19_CR30","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"19_CR31","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Chen, C.F.R., Fan, Q., Panda, R.: CrossViT: cross-attention multi-scale vision transformer for image classification. In: ICCV, pp. 357\u2013366 (2021)","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"19_CR33","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: MICCAI, pp. 234\u2013241 (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"issue":"1","key":"19_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-022-00561-y","volume":"9","author":"A Chiche","year":"2022","unstructured":"Chiche, A., Yitagesu, B.: Part of speech tagging: a systematic review of deep learning and machine learning approaches. J. Big Data 9(1), 1\u201325 (2022)","journal-title":"J. Big Data"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"19_CR36","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"19_CR37","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1007\/s11263-016-0895-4","volume":"118","author":"D Tzionas","year":"2016","unstructured":"Tzionas, D., Ballan, L., Srikantha, A., Aponte, P., Pollefeys, M., Gall, J.: Capturing hands in action using discriminative salient points and physics simulation. Int. J. Comput. Vis. 118, 172\u2013193 (2016)","journal-title":"Int. J. Comput. Vis."},{"key":"19_CR38","first-page":"6626","volume":"30","author":"M Heusel","year":"2017","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. Adv. Neural Inf. Process. Syst. 30, 6626\u20136637 (2017)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"19_CR39","unstructured":"Bi\u0144kowski, M., Sutherland, D., Arbel, M., Gretton, A.: Demystifying MMD GANs. In: International Conference Learning Representation (2018)"},{"key":"19_CR40","doi-asserted-by":"crossref","unstructured":"Narasimhaswamy, S., Bhattacharya, U., Chen, X., Dasgupta, I., Mitra, S., Hoai, M.: HanDiffuser: text-to-image generation with realistic hand appearances. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 2468\u20132479 (2024)","DOI":"10.1109\/CVPR52733.2024.00239"},{"key":"19_CR41","unstructured":"Zhou, Y., et al.: Mixture-of-experts with expert choice routing. Adv. Neural Inf. Process. Syst. 35, 7103\u20137114 (2022)"},{"issue":"120","key":"19_CR42","first-page":"1","volume":"23","author":"W Fedus","year":"2022","unstructured":"Fedus, W., Zoph, B., Shazeer, N.: Switch Transformers: scaling to trillion parameter models with simple and efficient sparsity. J. Mach. Learn. Res. 23(120), 1\u201339 (2022)","journal-title":"J. Mach. Learn. Res."},{"issue":"4","key":"19_CR43","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592116","volume":"42","author":"H Chefer","year":"2023","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: attention-based semantic guidance for text-to-image diffusion models. ACM Trans. Graph. 42(4), 1\u201310 (2023)","journal-title":"ACM Trans. Graph."},{"key":"19_CR44","unstructured":"Van der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(86), 2579\u20132605 (2008)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73027-6_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T18:15:32Z","timestamp":1732558532000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73027-6_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,26]]},"ISBN":["9783031730269","9783031730276"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73027-6_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,26]]},"assertion":[{"value":"26 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}