{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:46:48Z","timestamp":1777657608157,"version":"3.51.4"},"publisher-location":"Cham","reference-count":104,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733826","type":"print"},{"value":"9783031733833","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73383-3_26","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T12:06:27Z","timestamp":1730549187000},"page":"445-463","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Plan, Posture and\u00a0Go: Towards Open-Vocabulary Text-to-Motion Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8689-8248","authenticated-orcid":false,"given":"Jinpeng","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1465-3739","authenticated-orcid":false,"given":"Wenxun","family":"Dai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9400-9107","authenticated-orcid":false,"given":"Chunyu","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1457-2328","authenticated-orcid":false,"given":"Yiji","family":"Cheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1534-4549","authenticated-orcid":false,"given":"Yansong","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8788-2453","authenticated-orcid":false,"given":"Xin","family":"Tong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"26_CR1","doi-asserted-by":"crossref","unstructured":"Ahn, H., Ha, T., Choi, Y., Yoo, H., Oh, S.: Text2Action: generative adversarial synthesis from language to action. In: ICRA, pp. 5915\u20135920 (2018)","DOI":"10.1109\/ICRA.2018.8460608"},{"key":"26_CR2","unstructured":"Ahn, M., et\u00a0al.: Do as i can, not as i say: grounding language in robotic affordances. arXiv preprint arXiv:2204.01691 (2022)"},{"key":"26_CR3","doi-asserted-by":"crossref","unstructured":"Aksan, E., Kaufmann, M., Cao, P., Hilliges, O.: A spatio-temporal transformer for 3D human motion prediction. In: 3DV, pp. 565\u2013574 (2021)","DOI":"10.1109\/3DV53792.2021.00066"},{"key":"26_CR4","doi-asserted-by":"crossref","unstructured":"Athanasiou, N., Petrovich, M., Black, M.J., Varol, G.: Teach: temporal action composition for 3D humans. In: 3DV, pp. 414\u2013423 (2022)","DOI":"10.1109\/3DV57658.2022.00053"},{"key":"26_CR5","doi-asserted-by":"crossref","unstructured":"Barsoum, E., Kender, J., Liu, Z.: HP-GAN: probabilistic 3D human motion prediction via GAN. In: CVPRW, pp. 1418\u20131427 (2018)","DOI":"10.1109\/CVPRW.2018.00191"},{"key":"26_CR6","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: InstructPix2Pix: learning to follow image editing instructions. In: CVPR, pp. 18392\u201318402 (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"26_CR7","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. NIPS, pp. 1877\u20131901 (2020)"},{"key":"26_CR8","unstructured":"Bubeck, S., et\u00a0al.: Sparks of artificial general intelligence: early experiments with GPT-4. arXiv preprint arXiv:2303.12712 (2023)"},{"key":"26_CR9","doi-asserted-by":"crossref","unstructured":"Cai, Y., et\u00a0al.: Learning progressive joint propagation for human motion prediction. In: ECCV, pp. 226\u2013242 (2020)","DOI":"10.1007\/978-3-030-58571-6_14"},{"key":"26_CR10","doi-asserted-by":"crossref","unstructured":"Chai, J., Hodgins, J.K.: Constraint-based motion optimization using a statistical dynamic model. SIGGRAPH, 8\u2013es (2007)","DOI":"10.1145\/1275808.1276387"},{"key":"26_CR11","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: Executing your commands via motion diffusion in latent space. In: CVPR, pp. 18000\u201318010 (2023)","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"26_CR12","doi-asserted-by":"crossref","unstructured":"Chiu, H.K., Adeli, E., Wang, B., Huang, D.A., Niebles, J.C.: Action-agnostic human pose forecasting. In: WACV, pp. 1423\u20131432 (2019)","DOI":"10.1109\/WACV.2019.00156"},{"key":"26_CR13","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"26_CR14","unstructured":"Cong, P., et al.: LaserHuman: language-guided scene-aware human motion generation in free environment. arXiv preprint arXiv:2403.13307 (2024)"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Dabral, R., Mughal, M.H., Golyanik, V., Theobalt, C.: MoFusion: a framework for denoising-diffusion-based motion synthesis. In: CVPR, pp. 9760\u20139770 (2023)","DOI":"10.1109\/CVPR52729.2023.00941"},{"key":"26_CR16","doi-asserted-by":"crossref","unstructured":"Dai, W., Chen, L.H., Wang, J., Liu, J., Dai, B., Tang, Y.: MotionLCM: real-time controllable motion generation via latent consistency model. arXiv preprint arXiv:2404.19759 (2024)","DOI":"10.1007\/978-3-031-72640-8_22"},{"key":"26_CR17","doi-asserted-by":"crossref","unstructured":"Dang, L., Nie, Y., Long, C., Zhang, Q., Li, G.: MSR-GCN: multi-scale residual graph convolution networks for human motion prediction. In: ICCV, pp. 11467\u201311476 (2021)","DOI":"10.1109\/ICCV48922.2021.01127"},{"key":"26_CR18","doi-asserted-by":"crossref","unstructured":"Delmas, G., Weinzaepfel, P., Lucas, T., Moreno-Noguer, F., Rogez, G.: PoseScript: 3D human poses from natural language. In: ECCV, pp. 346\u2013362 (2022)","DOI":"10.1007\/978-3-031-20068-7_20"},{"key":"26_CR19","unstructured":"Duan, Y., et al.: Single-shot motion completion with transformer. arXiv preprint arXiv:2103.00776 (2021)"},{"key":"26_CR20","unstructured":"Elgammal, A., Lee, C.S.: Inferring 3D body pose from silhouettes using activity manifold learning. In: CVPR (2004)"},{"issue":"3","key":"26_CR21","doi-asserted-by":"publisher","first-page":"268","DOI":"10.1109\/PROC.1973.9030","volume":"61","author":"GD Forney","year":"1973","unstructured":"Forney, G.D.: The viterbi algorithm. IEEE 61(3), 268\u2013278 (1973)","journal-title":"IEEE"},{"key":"26_CR22","doi-asserted-by":"crossref","unstructured":"Fragkiadaki, K., Levine, S., Felsen, P., Malik, J.: Recurrent network models for human dynamics. In: ICCV, pp. 4346\u20134354 (2015)","DOI":"10.1109\/ICCV.2015.494"},{"issue":"11","key":"26_CR23","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"26_CR24","doi-asserted-by":"crossref","unstructured":"Gopalakrishnan, A., Mali, A., Kifer, D., Giles, L., Ororbia, A.G.: A neural temporal model for human motion prediction. In: CVPR, pp. 12116\u201312125 (2019)","DOI":"10.1109\/CVPR.2019.01239"},{"key":"26_CR25","doi-asserted-by":"crossref","unstructured":"Gui, L.Y., Wang, Y.X., Liang, X., Moura, J.M.: Adversarial geometry-aware human motion prediction. In: ECCV, pp. 786\u2013803 (2018)","DOI":"10.1007\/978-3-030-01225-0_48"},{"key":"26_CR26","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3D human motions from text. In: CVPR, pp. 5152\u20135161 (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"26_CR27","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Action2Motion: conditioned generation of 3D human motions. In: ACM MM, pp. 2021\u20132029 (2020)","DOI":"10.1145\/3394171.3413635"},{"key":"26_CR28","unstructured":"Hao, Y., Chi, Z., Dong, L., Wei, F.: Optimizing prompts for text-to-image generation. arXiv preprint arXiv:2212.09611 (2022)"},{"key":"26_CR29","doi-asserted-by":"crossref","unstructured":"Harvey, F.G., Pal, C.: Recurrent transition networks for character locomotion. SIGGRAPH Asia, 1\u20134 (2018)","DOI":"10.1145\/3283254.3283277"},{"key":"26_CR30","doi-asserted-by":"crossref","unstructured":"Hernandez, A., Gall, J., Moreno-Noguer, F.: Human motion prediction via spatio-temporal inpainting. In: ICCV, pp. 7134\u20137143 (2019)","DOI":"10.1109\/ICCV.2019.00723"},{"key":"26_CR31","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. NIPS 33, 6840\u20136851 (2020)","journal-title":"NIPS"},{"key":"26_CR32","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"26_CR33","doi-asserted-by":"crossref","unstructured":"Hong, F., Zhang, M., Pan, L., Cai, Z., Yang, L., Liu, Z.: AvatarCLIP: zero-shot text-driven generation and animation of 3D avatars. arXiv preprint arXiv:2205.08535 (2022)","DOI":"10.1145\/3528223.3530094"},{"key":"26_CR34","unstructured":"Hong, S., Seo, J., Hong, S., Shin, H., Kim, S.: Large language models are frame-level directors for zero-shot text-to-video generation. arXiv preprint arXiv:2305.14330 (2023)"},{"key":"26_CR35","doi-asserted-by":"crossref","unstructured":"Huang, C., Mees, O., Zeng, A., Burgard, W.: Visual language maps for robot navigation. In: ICRA, pp. 10608\u201310615 (2023)","DOI":"10.1109\/ICRA48891.2023.10160969"},{"key":"26_CR36","unstructured":"Huang, W., Abbeel, P., Pathak, D., Mordatch, I.: Language models as zero-shot planners: extracting actionable knowledge for embodied agents. In: ICML, pp. 9118\u20139147 (2022)"},{"key":"26_CR37","doi-asserted-by":"crossref","unstructured":"Jain, A., Zamir, A.R., Savarese, S., Saxena, A.: Structural-RNN: deep learning on spatio-temporal graphs. In: CVPR, pp. 5308\u20135317 (2016)","DOI":"10.1109\/CVPR.2016.573"},{"key":"26_CR38","doi-asserted-by":"crossref","unstructured":"Ji, Y., Xu, F., Yang, Y., Shen, F., Shen, H.T., Zheng, W.S.: A large-scale RGB-D database for arbitrary-view human action recognition. In: ACM MM, pp. 1510\u20131518 (2018)","DOI":"10.1145\/3240508.3240675"},{"key":"26_CR39","doi-asserted-by":"crossref","unstructured":"Kaufmann, M., Aksan, E., Song, J., Pece, F., Ziegler, R., Hilliges, O.: Convolutional autoencoders for human motion infilling. In: 3DV, pp. 918\u2013927 (2020)","DOI":"10.1109\/3DV50981.2020.00102"},{"key":"26_CR40","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational Bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"26_CR41","unstructured":"Li, K., et al.: VideoChat: chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"key":"26_CR42","doi-asserted-by":"crossref","unstructured":"Li, R., et al.: FineDance: a fine-grained choreography dataset for 3D full body dance generation. In: CVPR (2023)","DOI":"10.1109\/ICCV51070.2023.00939"},{"key":"26_CR43","unstructured":"Lin, H., Zala, A., Cho, J., Bansal, M.: VideoDirectorGPT: consistent multi-scene video generation via LLM-guided planning. arXiv preprint arXiv:2309.15091 (2023)"},{"key":"26_CR44","unstructured":"Lin, J., et al.: Motion-X: a large-scale 3D expressive whole-body human motion dataset. In: Advances in Neural Information Processing Systems (2023)"},{"key":"26_CR45","doi-asserted-by":"crossref","unstructured":"Lin, J., et al.: Being comes from not-being: open-vocabulary text-to-motion generation with wordless training. In: CVPR, pp. 23222\u201323231 (2023)","DOI":"10.1109\/CVPR52729.2023.02224"},{"key":"26_CR46","unstructured":"Lin, X., Amer, M.R.: Human motion modeling using dvgans. arXiv preprint arXiv:1804.10652 (2018)"},{"issue":"6","key":"26_CR47","first-page":"2133","volume":"31","author":"X Liu","year":"2020","unstructured":"Liu, X., Yin, J., Liu, J., Ding, P., Liu, J., Liu, H.: TrajectoryCNN: a new spatio-temporal feature learning network for human motion prediction. TCSVT 31(6), 2133\u20132146 (2020)","journal-title":"TCSVT"},{"key":"26_CR48","unstructured":"Liu, Z., et\u00a0al.: InternChat: solving vision-centric tasks by interacting with chatbots beyond language. arXiv preprint arXiv:2305.05662 (2023)"},{"key":"26_CR49","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. SIGGRAPH Asia (2015)","DOI":"10.1145\/2816795.2818013"},{"key":"26_CR50","unstructured":"Lu, Y., et al.: Neuro-symbolic procedural planning with commonsense prompting. arXiv preprint arXiv:2206.02928 (2022)"},{"key":"26_CR51","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: AMASS: archive of motion capture as surface shapes. In: ICCV, pp. 5442\u20135451 (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"26_CR52","doi-asserted-by":"crossref","unstructured":"Martinez, J., Black, M.J., Romero, J.: On human motion prediction using recurrent neural networks. In: CVPR, pp. 2891\u20132900 (2017)","DOI":"10.1109\/CVPR.2017.497"},{"issue":"1","key":"26_CR53","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1640443.1640452","volume":"29","author":"J Min","year":"2009","unstructured":"Min, J., Chen, Y.L., Chai, J.: Interactive generation of human animation with deformable motion models. TOG 29(1), 1\u201312 (2009)","journal-title":"TOG"},{"key":"26_CR54","doi-asserted-by":"crossref","unstructured":"Ngo, J.T., Marks, J.: Spacetime constraints revisited. SIGGRAPH, 343\u2013350 (1993)","DOI":"10.1145\/166117.166160"},{"key":"26_CR55","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: ICML, pp. 8162\u20138171 (2021)"},{"key":"26_CR56","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. In: NIPS, pp. 27730\u201327744 (2022)"},{"key":"26_CR57","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3D hands, face, and body from a single image. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01123"},{"key":"26_CR58","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3D hands, face, and body from a single image. In: CVPR, pp. 10975\u201310985 (2019)","DOI":"10.1109\/CVPR.2019.01123"},{"key":"26_CR59","doi-asserted-by":"publisher","first-page":"855","DOI":"10.1007\/s11263-019-01245-6","volume":"128","author":"D Pavllo","year":"2020","unstructured":"Pavllo, D., Feichtenhofer, C., Auli, M., Grangier, D.: Modeling human motion with quaternion-based neural networks. IJCV 128, 855\u2013872 (2020)","journal-title":"IJCV"},{"key":"26_CR60","unstructured":"Pavllo, D., Grangier, D., Auli, M.: QuaterNet: a quaternion-based recurrent model for human motion. arXiv preprint arXiv:1805.06485 (2018)"},{"key":"26_CR61","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Action-conditioned 3D human motion synthesis with transformer VAE. In: ICCV, pp. 10985\u201310995 (2021)","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"26_CR62","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from textual descriptions. In: ECCV, pp. 480\u2013497 (2022)","DOI":"10.1007\/978-3-031-20047-2_28"},{"key":"26_CR63","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TMR: text-to-motion retrieval using contrastive 3D human motion synthesis. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00870"},{"issue":"4","key":"26_CR64","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1089\/big.2016.0028","volume":"4","author":"M Plappert","year":"2016","unstructured":"Plappert, M., Mandery, C., Asfour, T.: The KIT motion-language dataset. Big Data 4(4), 236\u2013252 (2016)","journal-title":"Big Data"},{"key":"26_CR65","doi-asserted-by":"crossref","unstructured":"Punnakkal, A.R., Chandrasekaran, A., Athanasiou, N., Quiros-Ramirez, A., Black, M.J.: BABEL: Bodies, action and behavior with English labels. In: CVPR, pp. 722\u2013731 (2021)","DOI":"10.1109\/CVPR46437.2021.00078"},{"issue":"6","key":"26_CR66","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3550454.3555454","volume":"41","author":"J Qin","year":"2022","unstructured":"Qin, J., Zheng, Y., Zhou, K.: Motion in-betweening via two-stage transformers. TOG 41(6), 1\u201316 (2022)","journal-title":"TOG"},{"key":"26_CR67","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"26_CR68","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"26_CR69","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Sentence-BERT: sentence embeddings using Siamese BERT-networks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing (2019)","DOI":"10.18653\/v1\/D19-1410"},{"key":"26_CR70","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Making monolingual sentence embeddings multilingual using knowledge distillation. In: EMNLP (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.365"},{"key":"26_CR71","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"26_CR72","doi-asserted-by":"crossref","unstructured":"Romero, J., Tzionas, D., Black, M.J.: Embodied Hands: modeling and capturing hands and bodies together. SIGGRAPH Asia 36(6) (2017)","DOI":"10.1145\/3130800.3130883"},{"key":"26_CR73","doi-asserted-by":"crossref","unstructured":"Rose, C., Guenter, B., Bodenheimer, B., Cohen, M.F.: Efficient generation of motion transitions using spacetime constraints. SIGGRAPH, 147\u2013154 (1996)","DOI":"10.1145\/237170.237229"},{"key":"26_CR74","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)"},{"key":"26_CR75","unstructured":"Shafir, Y., Tevet, G., Kapon, R., Bermano, A.H.: Human motion diffusion as a generative prior. arXiv preprint arXiv:2303.01418 (2023)"},{"key":"26_CR76","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.T., Wang, G.: NTU RGB+ D: a large scale dataset for 3D human activity analysis. In: CVPR, pp. 1010\u20131019 (2016)","DOI":"10.1109\/CVPR.2016.115"},{"key":"26_CR77","unstructured":"Shen, Y., Song, K., Tan, X., Li, D., Lu, W., Zhuang, Y.: HuggingGPT: solving AI tasks with ChatGPT and its friends in huggingface. arXiv preprint arXiv:2303.17580 (2023)"},{"key":"26_CR78","doi-asserted-by":"crossref","unstructured":"Singh, I., et al.: ProgPrompt: generating situated robot task plans using large language models. In: ICRA, pp. 11523\u201311530 (2023)","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"26_CR79","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: ICML, pp. 2256\u20132265 (2015)"},{"key":"26_CR80","doi-asserted-by":"crossref","unstructured":"Song, C.H., Wu, J., Washington, C., Sadler, B.M., Chao, W.L., Su, Y.: LLM-Planner: few-shot grounded planning for embodied agents with large language models. In: ICCV, pp. 2998\u20133009 (2023)","DOI":"10.1109\/ICCV51070.2023.00280"},{"issue":"4","key":"26_CR81","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530090","volume":"41","author":"X Tang","year":"2022","unstructured":"Tang, X., et al.: Real-time controllable motion transition for characters. TOG 41(4), 1\u201310 (2022)","journal-title":"TOG"},{"key":"26_CR82","doi-asserted-by":"crossref","unstructured":"Tang, Y., et al.: FLAG3D: a 3D fitness activity dataset with language instruction. In: CVPR, pp. 22106\u201322117 (2023)","DOI":"10.1109\/CVPR52729.2023.02117"},{"key":"26_CR83","doi-asserted-by":"crossref","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: MotionCLIP: exposing human motion generation to clip space. In: ECCV, pp. 358\u2013374 (2022)","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"26_CR84","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-Or, D., Bermano, A.H.: Human motion diffusion model. arXiv preprint arXiv:2209.14916 (2022)"},{"key":"26_CR85","unstructured":"Touvron, H., et\u00a0al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"26_CR86","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NIPS (2017)"},{"key":"26_CR87","unstructured":"Wan, W., Dou, Z., Komura, T., Wang, W., Jayaraman, D., Liu, L.: TLControl: trajectory and language control for human motion synthesis. arXiv preprint arXiv:2311.17135 (2023)"},{"issue":"2","key":"26_CR88","doi-asserted-by":"publisher","first-page":"283","DOI":"10.1109\/TPAMI.2007.1167","volume":"30","author":"JM Wang","year":"2007","unstructured":"Wang, J.M., Fleet, D.J., Hertzmann, A.: Gaussian process dynamical models for human motion. TPAMI 30(2), 283\u2013298 (2007)","journal-title":"TPAMI"},{"issue":"4","key":"26_CR89","doi-asserted-by":"publisher","first-page":"159","DOI":"10.1145\/378456.378507","volume":"22","author":"A Witkin","year":"1988","unstructured":"Witkin, A., Kass, M.: Spacetime constraints. SIGGRAPH 22(4), 159\u2013168 (1988)","journal-title":"SIGGRAPH"},{"key":"26_CR90","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual ChatGPT: talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671 (2023)"},{"key":"26_CR91","unstructured":"Xiao, Z., et al.: Unified human-scene interaction via prompted chain-of-contacts. arXiv preprint arXiv:2309.07918 (2023)"},{"key":"26_CR92","unstructured":"Yang, Z., et al.: MM-REACT: prompting ChatGPT for multimodal reasoning and action. arXiv preprint arXiv:2303.11381 (2023)"},{"key":"26_CR93","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Song, J., Iqbal, U., Vahdat, A., Kautz, J.: PhysDiff: physics-guided human motion diffusion model. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"26_CR94","unstructured":"Zeng, A., et\u00a0al.: GLM-130B: an open bilingual pre-trained model. arXiv preprint arXiv:2210.02414 (2022)"},{"key":"26_CR95","unstructured":"Zhang, B., et al.: RodinHD: high-fidelity 3D avatar generation with diffusion models (2024)"},{"key":"26_CR96","unstructured":"Zhang, B., et al.: GaussianCube: structuring gaussian splatting using optimal transport for 3D generative modeling. arXiv preprint arXiv:2403.19655 (2024)"},{"key":"26_CR97","unstructured":"Zhang, M., et al.: MotionDiffuse: text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)"},{"key":"26_CR98","doi-asserted-by":"crossref","unstructured":"Zhang, M., et al.: ReMoDiffuse: retrieval-augmented motion diffusion model. arXiv preprint arXiv:2304.01116 (2023)","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"26_CR99","unstructured":"Zhang, S., et\u00a0al.: OPT: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"26_CR100","doi-asserted-by":"crossref","unstructured":"Zhang, X., van\u00a0de Panne, M.: Data-driven autocompletion for keyframe animation. SIGGRAPH, 1\u201311 (2018)","DOI":"10.1145\/3274247.3274502"},{"key":"26_CR101","unstructured":"Zheng, K., et al.: JARVIS: a neuro-symbolic commonsense reasoning framework for conversational embodied agents. arXiv preprint arXiv:2208.13266 (2022)"},{"key":"26_CR102","doi-asserted-by":"crossref","unstructured":"Zhou, W., et al.: EMDM: efficient motion diffusion model for fast, high-quality motion generation. arXiv preprint arXiv:2312.02256 (2023)","DOI":"10.1007\/978-3-031-72627-9_2"},{"key":"26_CR103","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Barnes, C., Lu, J., Yang, J., Li, H.: On the continuity of rotation representations in neural networks. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00589"},{"key":"26_CR104","unstructured":"Zhou, Y., Lu, J., Barnes, C., Yang, J., Xiang, S., et\u00a0al.: Generative Tweening: long-term inbetweening of 3D human motions. arXiv preprint arXiv:2005.08891 (2020)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73383-3_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T12:12:32Z","timestamp":1730549552000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73383-3_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031733826","9783031733833"],"references-count":104,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73383-3_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}