{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T04:03:20Z","timestamp":1749269000478,"version":"3.41.0"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031915772","type":"print"},{"value":"9783031915789","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-91578-9_4","type":"book-chapter","created":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T09:23:09Z","timestamp":1749201789000},"page":"68-84","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Adaptive Multi-modal Control of\u00a0Digital Human Hand Synthesis Using a\u00a0Region-Aware Cycle Loss"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3505-9865","authenticated-orcid":false,"given":"Qifan","family":"Fu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8103-5664","authenticated-orcid":false,"given":"Xiaohang","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3672-2414","authenticated-orcid":false,"given":"Muhammad","family":"Asad","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6522-2451","authenticated-orcid":false,"given":"Changjae","family":"Oh","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6918-8588","authenticated-orcid":false,"given":"Shanxin","family":"Yuan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4060-5226","authenticated-orcid":false,"given":"Gregory","family":"Slabaugh","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"4_CR1","unstructured":"Arkhipkin, V., Shaheen, Z., Vasilev, V., Dakhova, E., Kuznetsov, A., Dimitrov, D.: FusionFrames: efficient architectural aspects for text-to-video generation pipeline. arXiv preprint arXiv:2311.13073 (2023)"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Avrahami, O., et al.: The chosen one: consistent characters in text-to-image diffusion models. arXiv preprint arXiv:2311.10093 (2023)","DOI":"10.1145\/3641519.3657430"},{"key":"4_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"455","DOI":"10.1007\/978-3-030-58580-8_27","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Cai","year":"2020","unstructured":"Cai, Y., et al.: Learning delicate local representations for multi-person pose estimation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 455\u2013472. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_27"},{"key":"4_CR4","unstructured":"Cao, Z., Hidalgo Martinez, G., Simon, T., Wei, S., Sheikh, Y.A.: OpenPose: realtime multi-person 2D pose estimation using part affinity fields. IEEE Trans. Pattern Anal. Mach. Intell. (2019)"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Chan, C., Ginosar, S., Zhou, T., Efros, A.A.: Everybody dance now. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5933\u20135942 (2019)","DOI":"10.1109\/ICCV.2019.00603"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Duarte, A., et al.: How2Sign: a large-scale multimodal dataset for continuous American sign language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2735\u20132744 (2021)","DOI":"10.1109\/CVPR46437.2021.00276"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Eftekhar, A., Sax, A., Malik, J., Zamir, A.: OmniData: a scalable pipeline for making multi-task mid-level vision datasets from 3D scans. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10786\u201310796 (2021)","DOI":"10.1109\/ICCV48922.2021.01061"},{"key":"4_CR8","unstructured":"Feng, M., et\u00a0al.: DreaMoving: a human video generation framework based on diffusion models. arXiv e-prints pp. arXiv\u20132312 (2023)"},{"key":"4_CR9","doi-asserted-by":"crossref","unstructured":"Guo, Y., Yang, C., Rao, A., Agrawala, M., Lin, D., Dai, B.: SparseCtrl: adding sparse controls to text-to-video diffusion models. arXiv preprint arXiv:2311.16933 (2023)","DOI":"10.1007\/978-3-031-72946-1_19"},{"key":"4_CR10","unstructured":"Hu, L., Gao, X., Zhang, P., Sun, K., Zhang, B., Bo, L.: Animate anyone: consistent and controllable image-to-video synthesis for character animation. arXiv preprint arXiv:2311.17117 (2023)"},{"key":"4_CR11","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"4_CR12","unstructured":"Lin, H., Cho, J., Zala, A., Bansal, M.: Ctrl-adapter: an efficient and versatile framework for adapting diverse controls to any diffusion model (2024)"},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Lin, J., Zeng, A., Wang, H., Zhang, L., Li, Y.: One-stage 3D whole-body mesh recovery with component aware transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21159\u201321168 (2023)","DOI":"10.1109\/CVPR52729.2023.02027"},{"issue":"9","key":"4_CR14","first-page":"5114","volume":"44","author":"W Liu","year":"2021","unstructured":"Liu, W., Piao, Z., Tu, Z., Luo, W., Ma, L., Gao, S.: Liquid warping GAN with attention: a unified framework for human image synthesis. IEEE Trans. Pattern Anal. Mach. Intell. 44(9), 5114\u20135132 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"4_CR15","unstructured":"Liu, X., et al.: HyperHuman: hyper-realistic human generation with latent structural diffusion. arXiv preprint arXiv:2310.08579 (2023)"},{"key":"4_CR16","doi-asserted-by":"crossref","unstructured":"Narasimhaswamy, S., Bhattacharya, U., Chen, X., Dasgupta, I., Mitra, S., Hoai, M.: HanDiffuser: text-to-image generation with realistic hand appearances. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00239"},{"key":"4_CR17","unstructured":"Pham, T.X., Kang, Z., Yoo, C.D.: Cross-view masked diffusion transformers for person image synthesis. arXiv preprint arXiv:2402.01516 (2024)"},{"key":"4_CR18","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"4_CR19","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125, vol. 1, no. 2, p. 3 (2022)"},{"key":"4_CR20","unstructured":"Ren, W., et al.: ConsistI2V: enhancing visual consistency for image-to-video generation. arXiv preprint arXiv:2402.04324 (2024)"},{"key":"4_CR21","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"4_CR22","unstructured":"Saunders, B., Camgoz, N.C., Bowden, R.: Everybody sign now: translating spoken language to photo realistic sign language video. arXiv preprint arXiv:2011.09846 (2020)"},{"key":"4_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"687","DOI":"10.1007\/978-3-030-58621-8_40","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Saunders","year":"2020","unstructured":"Saunders, B., Camgoz, N.C., Bowden, R.: Progressive transformers for end-to-end sign language production. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020, Part XI. LNCS, vol. 12356, pp. 687\u2013705. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_40"},{"issue":"7","key":"4_CR24","doi-asserted-by":"publisher","first-page":"2113","DOI":"10.1007\/s11263-021-01457-9","volume":"129","author":"B Saunders","year":"2021","unstructured":"Saunders, B., Camgoz, N.C., Bowden, R.: Continuous 3D multi-channel sign language production via progressive transformers and mixture density networks. Int. J. Comput. Vision 129(7), 2113\u20132135 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Saunders, B., Camgoz, N.C., Bowden, R.: Signing at scale: learning to co-articulate signs for large-scale photo-realistic sign language production. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5141\u20135151 (2022)","DOI":"10.1109\/CVPR52688.2022.00508"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Shi, X., et\u00a0al.: Motion-I2V: consistent and controllable image-to-video generation with explicit motion modeling. arXiv preprint arXiv:2401.15977 (2024)","DOI":"10.1145\/3641519.3657497"},{"key":"4_CR27","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"4_CR28","unstructured":"Wang, W., et\u00a0al.: MagicVideo-V2: multi-stage high-aesthetic video generation. arXiv preprint arXiv:2401.04468 (2024)"},{"key":"4_CR29","unstructured":"Xie, P., Zhang, Q., Li, Z., Tang, H., Du, Y., Hu, X.: Vector quantized diffusion model with CodeUnet for text-to-sign pose sequences generation. arXiv preprint arXiv:2208.09141 (2022)"},{"key":"4_CR30","unstructured":"Xu, Z., Wei, K., Yang, X., Deng, C.: Do you guys want to dance: zero-shot compositional human dance generation with multiple persons. arXiv preprint arXiv:2401.13363 (2024)"},{"key":"4_CR31","doi-asserted-by":"crossref","unstructured":"Yang, Z., Zeng, A., Yuan, C., Li, Y.: Effective whole-body pose estimation with two-stages distillation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4210\u20134220 (2023)","DOI":"10.1109\/ICCVW60793.2023.00455"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Zhang, F., Zhu, X., Dai, H., Ye, M., Zhu, C.: Distribution-aware coordinate representation for human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7093\u20137102 (2020)","DOI":"10.1109\/CVPR42600.2020.00712"},{"key":"4_CR33","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"4_CR34","doi-asserted-by":"crossref","unstructured":"Zhu, S., et al.: Champ: controllable and consistent human image animation with 3D parametric guidance (2024)","DOI":"10.1007\/978-3-031-73001-6_9"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-91578-9_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T09:23:18Z","timestamp":1749201798000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-91578-9_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031915772","9783031915789"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-91578-9_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}