{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:10:01Z","timestamp":1778080201342,"version":"3.51.4"},"publisher-location":"Cham","reference-count":71,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732089","type":"print"},{"value":"9783031732096","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73209-6_4","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T15:02:57Z","timestamp":1730386977000},"page":"55-73","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["PoseEmbroider: Towards a\u00a03D, Visual, Semantic-Aware Human Pose Representation"],"prefix":"10.1007","author":[{"given":"Ginger","family":"Delmas","sequence":"first","affiliation":[]},{"given":"Philippe","family":"Weinzaepfel","sequence":"additional","affiliation":[]},{"given":"Francesc","family":"Moreno-Noguer","sequence":"additional","affiliation":[]},{"given":"Gr\u00e9gory","family":"Rogez","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"4_CR1","doi-asserted-by":"crossref","unstructured":"Ahuja, C., Morency, L.P.: Language2pose: natural language grounded pose forecasting. In: 3DV (2019)","DOI":"10.1109\/3DV.2019.00084"},{"key":"4_CR2","unstructured":"Akbari, H., et al.: Vatt: transformers for multimodal self-supervised learning from raw video, audio and text. In: NeurIPS (2021)"},{"key":"4_CR3","unstructured":"Alayrac, J.B., et al.: Self-supervised multimodal versatile networks. In: NeurIPS (2020)"},{"key":"4_CR4","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: ACL Workshop (2005)"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Barsoum, E., Kender, J., Liu, Z.: HP-GAN: probabilistic 3D human motion prediction via GAN. In: CVPRW (2018)","DOI":"10.1109\/CVPRW.2018.00191"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Black, M.J., Patel, P., Tesch, J., Yang, J.: BEDLAM: a synthetic dataset of bodies exhibiting detailed lifelike animated motion. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00843"},{"key":"4_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"561","DOI":"10.1007\/978-3-319-46454-1_34","volume-title":"Computer Vision \u2013 ECCV 2016","author":"F Bogo","year":"2016","unstructured":"Bogo, F., Kanazawa, A., Lassner, C., Gehler, P., Romero, J., Black, M.J.: Keep It SMPL: automatic estimation of 3D human pose and shape from a single image. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 561\u2013578. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_34"},{"key":"4_CR8","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: Instructpix2pix: learning to follow image editing instructions. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"4_CR9","unstructured":"Cai, Z., et\u00a0al.: Smpler-x: scaling up expressive human pose and shape estimation. In: NeurIPS (2024)"},{"key":"4_CR10","doi-asserted-by":"crossref","unstructured":"Chen, W., et al.: Beyond appearance: a semantic controllable self-supervised learning framework for human-centric visual tasks. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01445"},{"key":"4_CR11","unstructured":"Chen, Z., Li, Q., Wang, X., Yang, W.: LiftedCL: lifting contrastive learning for human-centric perception. In: ICLR (2022)"},{"key":"4_CR12","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* chatgpt quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Ci, Y., et al.: UniHCP: a unified model for human-centric perceptions. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01711"},{"key":"4_CR14","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"346","DOI":"10.1007\/978-3-031-20068-7_20","volume-title":"ECCV 2022","author":"G Delmas","year":"2022","unstructured":"Delmas, G., Weinzaepfel, P., Lucas, T., Moreno-Noguer, F., Rogez, G.: PoseScript: 3D human poses from natural language. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13666, pp. 346\u2013362. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_20"},{"key":"4_CR15","doi-asserted-by":"crossref","unstructured":"Delmas, G., Weinzaepfel, P., Moreno-Noguer, F., Rogez, G.: PoseFix: correcting 3D Human Poses with Natural Language. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01379"},{"key":"4_CR16","unstructured":"Ding, Y., Tian, C., Ding, H., Liu, L.: The clip model is secretly an image-to-prompt converter. In: NeurIPS (2024)"},{"key":"4_CR17","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"4_CR18","unstructured":"Driess, D., et al.: Palm-e: an embodied multimodal language model. In: arXiv preprint arXiv:2303.03378 (2023)"},{"key":"4_CR19","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: Vse++: improving visual-semantic embeddings with hard negatives (2018)"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Feng, Y., Lin, J., Dwivedi, S.K., Sun, Y., Patel, P., Black, M.J.: Chatpose: chatting about 3D human pose. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00204"},{"key":"4_CR21","doi-asserted-by":"crossref","unstructured":"Fieraru, M., Zanfir, M., Pirlea, S.C., Olaru, V., Sminchisescu, C.: AIFit: automatic 3D human-interpretable feedback models for fitness training. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00979"},{"key":"4_CR22","unstructured":"Frome, A., et al.: Devise: a deep visual-semantic embedding model. In: NeurIPS (2013)"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Cheema, N., Oguz, C., Theobalt, C., Slusallek, P.: Synthesis of compositional animations from textual descriptions. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00143"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: ImageBind: one embedding space to bind them all. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Singh, M., Ravi, N., van\u00a0der Maaten, L., Joulin, A., Misra, I.: Omnivore: a single model for many visual modalities. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01563"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Goel, S., Pavlakos, G., Rajasegaran, J., Kanazawa*, A., Malik*, J.: Humans in 4D: reconstructing and tracking humans with transformers. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01358"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3d human motions from text. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"4_CR28","unstructured":"Haykin, S.: Neural Networks: A Comprehensive Foundation. Prentice Hall PTR (1994)"},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Hong, F., Pan, L., Cai, Z., Liu, Z.: Versatile multi-modal pre-training for human-centric perception. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01568"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Ibrahimi, S., Sun, X., Wang, P., Garg, A., Sanan, A., Omar, M.: Audio-enhanced text-to-video retrieval using text-conditioned feature alignment. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01107"},{"key":"4_CR31","doi-asserted-by":"crossref","unstructured":"Jin, Z., Hayat, M., Yang, Y., Guo, Y., Lei, Y.: Context-aware alignment and mutual masking for 3D-language pre-training. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01057"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Black, M.J., Jacobs, D.W., Malik, J.: End-to-end recovery of human shape and pose. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00744"},{"key":"4_CR33","doi-asserted-by":"crossref","unstructured":"Kim, G., Kwon, T., Ye, J.C.: Diffusionclip: text-guided diffusion models for robust image manipulation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00246"},{"key":"4_CR34","doi-asserted-by":"crossref","unstructured":"Kim, H., Zala, A., Burri, G., Bansal, M.: FixMyPose: Pose correctional captioning and retrieval. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i14.17555"},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Kim, J., Kim, J., Choi, S.: Flame: Free-form language-based motion synthesis & editing. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i7.25996"},{"key":"4_CR36","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)"},{"key":"4_CR37","doi-asserted-by":"crossref","unstructured":"Kolotouros, N., Pavlakos, G., Black, M.J., Daniilidis, K.: Learning to reconstruct 3D human pose and shape via model-fitting in the loop. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00234"},{"key":"4_CR38","unstructured":"Kwon, G., Cai, Z., Ravichandran, A., Bas, E., Bhotika, R., Soatto, S.: Masked vision and language modeling for multi-modal representation learning. In: ICLR (2023)"},{"key":"4_CR39","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out (2004)"},{"key":"4_CR40","unstructured":"Lin, J., et al.: Motion-x: a large-scale 3D expressive whole-body human motion dataset. In: NeurIPS (2023)"},{"key":"4_CR41","doi-asserted-by":"crossref","unstructured":"Lin, J., Zeng, A., Wang, H., Zhang, L., Li, Y.: One-stage 3D whole-body mesh recovery with component aware transformer. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02027"},{"key":"4_CR42","doi-asserted-by":"crossref","unstructured":"Lin, Y., Wei, C., Wang, H., Yuille, A., Xie, C.: Smaug: sparse masked autoencoder for efficient video-language pre-training. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00233"},{"key":"4_CR43","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"4_CR44","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. ACM TOG (2015)","DOI":"10.1145\/2816795.2818013"},{"key":"4_CR45","unstructured":"Lucas, T., Baradel, F., Weinzaepfel, P., Rogez, G.: PoseGPT: quantizing human motion for large scale generative modeling. In: ECCV (2022)"},{"key":"4_CR46","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: AMASS: archive of motion capture as surface shapes. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"4_CR47","doi-asserted-by":"crossref","unstructured":"von Marcard, T., Henschel, R., Black, M., Rosenhahn, B., Pons-Moll, G.: Recovering accurate 3D human pose in the wild using IMUs and a moving camera. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01249-6_37"},{"key":"4_CR48","unstructured":"Mizrahi, D., et al.: 4m: massively multimodal masked modeling. In: NeurIPS (2024)"},{"key":"4_CR49","doi-asserted-by":"crossref","unstructured":"M\u00fcller, L., Osman, A.A.A., Tang, S., Huang, C.H.P., Black, M.J.: On self-contact and human pose. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00986"},{"key":"4_CR50","doi-asserted-by":"crossref","unstructured":"M\u00fcller, M., Arzt, A., Balke, S., Dorfer, M., Widmer, G.: Cross-modal music retrieval and applications: an overview of key methodologies. IEEE Signal Process. Mag. (2018)","DOI":"10.1109\/MSP.2018.2868887"},{"key":"4_CR51","unstructured":"Oord, A.V.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"4_CR52","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., jing Zhu, W.: Bleu: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"4_CR53","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3D hands, face, and body from a single image. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01123"},{"key":"4_CR54","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"480","DOI":"10.1007\/978-3-031-20047-2_28","volume-title":"ECCV 2022","author":"M Petrovich","year":"2022","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from textual descriptions. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13682, pp. 480\u2013497. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_28"},{"key":"4_CR55","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TMR: text-to-motion retrieval using contrastive 3D human motion synthesis. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00870"},{"key":"4_CR56","doi-asserted-by":"crossref","unstructured":"Plappert, M., Mandery, C., Asfour, T.: The kit motion-language dataset. Big Data (2016)","DOI":"10.1089\/big.2016.0028"},{"key":"4_CR57","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"4_CR58","doi-asserted-by":"crossref","unstructured":"Ruan, Y., Lee, H.H., Zhang, Y., Zhang, K., Chang, A.X.: TriCOLo: trimodal contrastive loss for text to shape retrieval. In: WACV (2024)","DOI":"10.1109\/WACV57701.2024.00571"},{"key":"4_CR59","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)"},{"key":"4_CR60","doi-asserted-by":"crossref","unstructured":"Tang, S., et\u00a0al.: Humanbench: towards general human-centric perception with projector assisted pretraining. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02104"},{"key":"4_CR61","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"358","DOI":"10.1007\/978-3-031-20047-2_21","volume-title":"ECCV 2022","author":"G Tevet","year":"2022","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: MotionCLIP: exposing human motion generation to clip space. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13682, pp. 358\u2013374. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_21"},{"key":"4_CR62","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"4_CR63","doi-asserted-by":"crossref","unstructured":"Vo, N., et al.: Composing text and image for image retrieval-an empirical odyssey. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00660"},{"key":"4_CR64","doi-asserted-by":"crossref","unstructured":"Von\u00a0Marcard, T., Henschel, R., Black, M.J., Rosenhahn, B., Pons-Moll, G.: Recovering accurate 3D human pose in the wild using IMUs and a moving camera. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01249-6_37"},{"key":"4_CR65","unstructured":"Wang, Y., et\u00a0al.: Hulk: a universal knowledge translator for human-centric tasks. arXiv preprint arXiv:2312.01697 (2023)"},{"key":"4_CR66","unstructured":"Wu, S., Fei, H., Qu, L., Ji, W., Chua, T.S.: Next-GPT: any-to-any multimodal LLM. arXiv preprint arXiv:2309.05519 (2023)"},{"key":"4_CR67","unstructured":"Xu, Y., Zhang, J., Zhang, Q., Tao, D.: ViTPose: simple vision transformer baselines for human pose estimation. In: NeurIPS (2022)"},{"key":"4_CR68","doi-asserted-by":"crossref","unstructured":"Yin, K., Zou, S., Ge, Y., Tian, Z.: Tri-modal motion retrieval by learning a joint embedding space. arXiv preprint arXiv:4030.0691 (2024)","DOI":"10.1109\/CVPR52733.2024.00158"},{"key":"4_CR69","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1007\/978-3-031-20062-5_11","volume-title":"ECCV 2022","author":"K Youwang","year":"2022","unstructured":"Youwang, K., Ji-Yeon, K., Oh, T.H.: Clip-actor: text-driven recommendation and stylization for animating human meshes. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13663, pp. 173\u2013191. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20062-5_11"},{"key":"4_CR70","unstructured":"Yuan, J., et\u00a0al.: Hap: structure-aware masked image modeling for human-centric perception. In: NeurIPS (2023)"},{"key":"4_CR71","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"180","DOI":"10.1007\/978-3-031-20068-7_11","volume-title":"ECCV 2022","author":"S Zhang","year":"2022","unstructured":"Zhang, S., et al.: EgoBody: Human body shape and motion of interacting people from head-mounted devices. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13666, pp. 180\u2013200. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_11"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73209-6_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,8]],"date-time":"2025-04-08T12:26:40Z","timestamp":1744115200000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73209-6_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9783031732089","9783031732096"],"references-count":71,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73209-6_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}