{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,10]],"date-time":"2025-11-10T21:21:34Z","timestamp":1762809694958,"version":"3.40.3"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726972"},{"type":"electronic","value":"9783031726989"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72698-9_2","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T04:45:57Z","timestamp":1729831557000},"page":"19-36","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["TELA: Text to\u00a0Layer-Wise 3D Clothed Human Generation"],"prefix":"10.1007","author":[{"given":"Junting","family":"Dong","sequence":"first","affiliation":[]},{"given":"Qi","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Zehuan","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Xudong","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Jingbo","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Sida","family":"Peng","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Dai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Alldieck, T., Magnor, M., Xu, W., Theobalt, C., Pons-Moll, G.: Video based reconstruction of 3D people models. In: CVPR (2018)","key":"2_CR1","DOI":"10.1109\/CVPR.2018.00875"},{"doi-asserted-by":"crossref","unstructured":"Bhatnagar, B.L., Tiwari, G., Theobalt, C., Pons-Moll, G.: Multi-garment net: learning to dress 3D people from images. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5420\u20135430 (2019)","key":"2_CR2","DOI":"10.1109\/ICCV.2019.00552"},{"key":"2_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"561","DOI":"10.1007\/978-3-319-46454-1_34","volume-title":"Computer Vision \u2013 ECCV 2016","author":"F Bogo","year":"2016","unstructured":"Bogo, F., Kanazawa, A., Lassner, C., Gehler, P., Romero, J., Black, M.J.: Keep It SMPL: automatic estimation of 3D human pose and shape from a single image. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 561\u2013578. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_34"},{"key":"2_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"doi-asserted-by":"crossref","unstructured":"Cao, Y., Cao, Y.P., Han, K., Shan, Y., Wong, K.Y.K.: DreamAvatar: text-and-Shape guided 3D human avatar generation via diffusion models. arXiv preprint arXiv:2304.00916 (2023)","key":"2_CR5","DOI":"10.1109\/CVPR52733.2024.00097"},{"issue":"1","key":"2_CR6","first-page":"1","volume":"41","author":"X Chen","year":"2021","unstructured":"Chen, X., Pang, A., Yang, W., Wang, P., Xu, L., Yu, J.: TightCap: 3D human shape capture with clothing tightness field. ACM TOG 41(1), 1\u201317 (2021)","journal-title":"ACM TOG"},{"doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: gDNA: towards generative detailed neural avatars. arXiv preprint arXiv:2201.04123 (2022)","key":"2_CR7","DOI":"10.1109\/CVPR52688.2022.01978"},{"doi-asserted-by":"crossref","unstructured":"Collet, A., et al.: High-quality streamable free-viewpoint video. ACM TOG 34(4), 1\u201313 (2015)","key":"2_CR8","DOI":"10.1145\/2766945"},{"doi-asserted-by":"crossref","unstructured":"Corona, E., Pumarola, A., Aleny\u00e0, G., Pons-Moll, G., Moreno-Noguer, F.: SMPLicit: topology-aware generative model for clothed people. In: CVPR (2021)","key":"2_CR9","DOI":"10.1109\/CVPR46437.2021.01170"},{"key":"2_CR10","first-page":"13654","volume":"35","author":"J Dong","year":"2022","unstructured":"Dong, J., et al.: TotalSelfScan: learning full-body avatars from self-portrait videos of faces, hands, and bodies. Adv. Neural. Inf. Process. Syst. 35, 13654\u201313667 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"doi-asserted-by":"crossref","unstructured":"Dong, J., Fang, Q., Yang, T., Shuai, Q., Qiao, C., Peng, S.: iVS-Net: learning human view synthesis from internet videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 22942\u201322951 (2023)","key":"2_CR11","DOI":"10.1109\/ICCV51070.2023.02097"},{"doi-asserted-by":"crossref","unstructured":"Feng, Y., Yang, J., Pollefeys, M., Black, M.J., Bolkart, T.: Capturing and animation of body and clothing from monocular video. In: SIGGRAPH Asia 2022 Conference Papers (2022)","key":"2_CR12","DOI":"10.1145\/3550469.3555423"},{"doi-asserted-by":"crossref","unstructured":"Guo, K., et\u00a0al.: The relightables: volumetric performance capture of humans with realistic relighting. ACM TOG 38(6), 1\u201319 (2019)","key":"2_CR13","DOI":"10.1145\/3355089.3356571"},{"issue":"4","key":"2_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530094","volume":"41","author":"F Hong","year":"2022","unstructured":"Hong, F., Zhang, M., Pan, L., Cai, Z., Yang, L., Liu, Z.: AvatarCLIP: zero-shot text-driven generation and animation of 3D avatars. ACM TOG 41(4), 1\u201319 (2022)","journal-title":"ACM TOG"},{"unstructured":"Hu, S., et al.: HumanLiff: layer-wise 3D human generation with diffusion model. arXiv preprint arXiv:2308.09712 (2023)","key":"2_CR15"},{"doi-asserted-by":"crossref","unstructured":"Huang, X., et al.: HumanNorm: learning normal diffusion model for high-quality and realistic 3D human generation. arXiv preprint arXiv:2310.01406 (2023)","key":"2_CR16","DOI":"10.1109\/CVPR52733.2024.00437"},{"unstructured":"Huang, Y., et al.: DreamWaltz: make a scene with complex 3D animatable avatars. In: NeurIPS (2023)","key":"2_CR17"},{"doi-asserted-by":"crossref","unstructured":"Jain, A., Mildenhall, B., Barron, J.T., Abbeel, P., Poole, B.: Zero-shot text-guided object generation with dream fields (2022)","key":"2_CR18","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"2_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1007\/978-3-030-58565-5_2","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Jiang","year":"2020","unstructured":"Jiang, B., Zhang, J., Hong, Y., Luo, J., Liu, L., Bao, H.: BCNet: learning body and cloth shape from a single image. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 18\u201335. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_2"},{"unstructured":"Jun, H., Nichol, A.: Shap-E: generating conditional 3d implicit functions. arXiv preprint arXiv:2305.02463 (2023)","key":"2_CR20"},{"unstructured":"Khalid, N.M., Xie, T., Belilovsky, E., Tiberiu, P.: CLIP-Mesh: generating textured meshes from text using pretrained image-text models. In: SIGGRAPH Asia 2022 Conference Papers (2022)","key":"2_CR21"},{"unstructured":"Kolotouros, N., Alldieck, T., Zanfir, A., Bazavan, E.G., Fieraru, M., Sminchisescu, C.: DreamHuman: animatable 3D avatars from text. arXiv preprint arXiv:2306.09329 (2023)","key":"2_CR22"},{"unstructured":"Liao, T., et al.: TADA! text to animatable digital avatars. arXiv preprint arXiv:2308.10899 (2023)","key":"2_CR23"},{"doi-asserted-by":"crossref","unstructured":"Lin, C.H., et al.: Magic3D: high-resolution Text-to-3D content creation. arXiv preprint arXiv:2211.10440 (2022)","key":"2_CR24","DOI":"10.1109\/CVPR52729.2023.00037"},{"doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Hoorick, B.V., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: zero-shot one image to 3D object (2023)","key":"2_CR25","DOI":"10.1109\/ICCV51070.2023.00853"},{"doi-asserted-by":"crossref","unstructured":"Ma, Q., et al.: Learning to dress 3D people in generative clothing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6469\u20136478 (2020)","key":"2_CR26","DOI":"10.1109\/CVPR42600.2020.00650"},{"doi-asserted-by":"crossref","unstructured":"Mescheder, L., Oechsle, M., Niemeyer, M., Nowozin, S., Geiger, A.: Occupancy networks: learning 3D reconstruction in function space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4460\u20134470 (2019)","key":"2_CR27","DOI":"10.1109\/CVPR.2019.00459"},{"doi-asserted-by":"crossref","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: Nerf: representing scenes as neural radiance fields for view synthesis. In: ECCV (2020)","key":"2_CR28","DOI":"10.1007\/978-3-030-58452-8_24"},{"doi-asserted-by":"crossref","unstructured":"M\u00fcller, T., Evans, A., Schied, C., Keller, A.: Instant neural graphics primitives with a multiresolution hash encoding. ACM TOG 41(4), 1\u201315 (2022)","key":"2_CR29","DOI":"10.1145\/3528223.3530127"},{"unstructured":"Nichol, A., Jun, H., Dhariwal, P., Mishkin, P., Chen, M.: Point-E: a system for generating 3D point clouds from complex prompts. arXiv preprint arXiv:2212.08751 (2022)","key":"2_CR30"},{"doi-asserted-by":"crossref","unstructured":"Park, J.J., Florence, P., Straub, J., Newcombe, R., Lovegrove, S.: DeepSDF: learning continuous signed distance functions for shape representation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 165\u2013174 (2019)","key":"2_CR31","DOI":"10.1109\/CVPR.2019.00025"},{"doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3D hands, face, and body from a single image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10975\u201310985 (2019)","key":"2_CR32","DOI":"10.1109\/CVPR.2019.01123"},{"doi-asserted-by":"crossref","unstructured":"Peng, S., et al.: Animatable neural radiance fields for modeling dynamic human bodies. In: ICCV (2021)","key":"2_CR33","DOI":"10.1109\/ICCV48922.2021.01405"},{"doi-asserted-by":"crossref","unstructured":"Peng, S.,et al.: Animatable implicit neural representations for creating realistic avatars from videos. IEEE Trans. Pattern Anal. Mach. Intell. 46(6), 4147\u20134159 (2024)","key":"2_CR34","DOI":"10.1109\/TPAMI.2024.3355287"},{"doi-asserted-by":"crossref","unstructured":"Peng, S., et al.: Neural body: Implicit neural representations with structured latent codes for novel view synthesis of dynamic humans. In: CVPR (2021)","key":"2_CR35","DOI":"10.1109\/CVPR46437.2021.00894"},{"doi-asserted-by":"crossref","unstructured":"Pons-Moll, G., Pujades, S., Hu, S., Black, M.J.: ClothCap: seamless 4D clothing capture and retargeting. ACM Trans. Graph. 36, 73:1\u201373:15 (2017)","key":"2_CR36","DOI":"10.1145\/3072959.3073711"},{"unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: DreamFusion: text-to-3D using 2D diffusion. In: ICLR (2023)","key":"2_CR37"},{"unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)","key":"2_CR38"},{"unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020). http:\/\/jmlr.org\/papers\/v21\/20-074.html","key":"2_CR39"},{"unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125 (2022)","key":"2_CR40"},{"unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125 (2022)","key":"2_CR41"},{"unstructured":"Ranade, S., et al.: SSDNeRF: semantic soft decomposition of neural radiance fields. arXiv preprint arXiv:2212.03406 (2022)","key":"2_CR42"},{"doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","key":"2_CR43","DOI":"10.1109\/CVPR52688.2022.01042"},{"unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487 (2022)","key":"2_CR44"},{"doi-asserted-by":"crossref","unstructured":"Saharia, C., Ho, J., Chan, W., Salimans, T., Fleet, D.J., Norouzi, M.: Image super-resolution via iterative refinement. arXiv preprint arXiv:2104.07636 (2021)","key":"2_CR45","DOI":"10.1109\/TPAMI.2022.3204461"},{"doi-asserted-by":"crossref","unstructured":"Saito, S., Huang, Z., Natsume, R., Morishima, S., Kanazawa, A., Li, H.: PIFu: pixel-aligned implicit function for high-resolution clothed human digitization. In: ICCV (2019)","key":"2_CR46","DOI":"10.1109\/ICCV.2019.00239"},{"doi-asserted-by":"crossref","unstructured":"Saito, S., Yang, J., Ma, Q., Black, M.J.: SCANimate: weakly supervised learning of skinned clothed avatar networks. In: CVPR (2021)","key":"2_CR47","DOI":"10.1109\/CVPR46437.2021.00291"},{"unstructured":"Schuhmann, C., et\u00a0al.: LAION-5B: an open large-scale dataset for training next generation image-text models. arXiv preprint arXiv:2210.08402 (2022)","key":"2_CR48"},{"doi-asserted-by":"crossref","unstructured":"Tiwari, G., Sarafianos, N., Tung, T., Pons-Moll, G.: Neural-GIF: neural generalized implicit functions for animating people in clothing. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11708\u201311718 (2021)","key":"2_CR49","DOI":"10.1109\/ICCV48922.2021.01150"},{"doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Disentangled clothed avatar generation from text descriptions. arXiv preprint arXiv:2312.05295 (2023)","key":"2_CR50","DOI":"10.1007\/978-3-031-72943-0_22"},{"unstructured":"Wang, S., Mihajlovic, M., Ma, Q., Geiger, A., Tang, S.: MetaAvatar: learning animatable clothed human models from few depth images. In: Advances in Neural Information Processing Systems (2021)","key":"2_CR51"},{"unstructured":"Wang, Z., et al.: ProlificDreamer: high-fidelity and diverse text-to-3D generation with variational score distillation. arXiv preprint arXiv:2305.16213 (2023)","key":"2_CR52"},{"unstructured":"Wu, J., Li, S., Ji, S., Wang, Y., Xiong, R., Liao, Y.: DORec: decomposed object reconstruction utilizing 2D self-supervised features. arXiv preprint arXiv:2310.11092 (2023)","key":"2_CR53"},{"doi-asserted-by":"crossref","unstructured":"Xiu, Y., Yang, J., Tzionas, D., Black, M.J.: ICON: implicit clothed humans obtained from normals. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13286\u201313296. IEEE (2022)","key":"2_CR54","DOI":"10.1109\/CVPR52688.2022.01294"},{"doi-asserted-by":"crossref","unstructured":"Yu, T., et al.: DoubleFusion: real-time capture of human performances with inner body shapes from a single depth sensor. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7287\u20137296 (2018)","key":"2_CR55","DOI":"10.1109\/CVPR.2018.00761"},{"doi-asserted-by":"crossref","unstructured":"Yu, T., et al.: SimulCap : single-view human performance capture with cloth simulation. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5499\u20135509 (2019)","key":"2_CR56","DOI":"10.1109\/CVPR.2019.00565"},{"doi-asserted-by":"crossref","unstructured":"Yuan, Y., et al.: GAvatar: animatable 3D Gaussian avatars with implicit mesh learning. arXiv preprint arXiv:2312.11461 (2023)","key":"2_CR57","DOI":"10.1109\/CVPR52733.2024.00091"},{"doi-asserted-by":"crossref","unstructured":"Zhang, H., Feng, Y., Kulits, P., Wen, Y., Thies, J., Black, M.J.: Text-guided generation and editing of compositional 3D avatars. arXiv preprint arXiv:2309.07125 (2023)","key":"2_CR58","DOI":"10.1109\/3DV62453.2024.00151"},{"doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: AvatarVerse: high-quality and stable 3D avatar creation from text and pose. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 7124\u20137132 (2024)","key":"2_CR59","DOI":"10.1609\/aaai.v38i7.28540"},{"doi-asserted-by":"crossref","unstructured":"Zhang, L., Agrawala, M.: Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543 (2023)","key":"2_CR60","DOI":"10.1109\/ICCV51070.2023.00355"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72698-9_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T07:20:50Z","timestamp":1732951250000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72698-9_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031726972","9783031726989"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72698-9_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}