{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T18:14:55Z","timestamp":1771956895989,"version":"3.50.1"},"publisher-location":"Cham","reference-count":125,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733468","type":"print"},{"value":"9783031733475","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73347-5_4","type":"book-chapter","created":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:15:43Z","timestamp":1730106943000},"page":"54-74","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["MotionChain: Conversational Motion Controllers via\u00a0Multimodal Prompts"],"prefix":"10.1007","author":[{"given":"Biao","family":"Jiang","sequence":"first","affiliation":[]},{"given":"Xin","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Chi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Fukun","family":"Yin","sequence":"additional","affiliation":[]},{"given":"Zhuoyuan","family":"Li","sequence":"additional","affiliation":[]},{"given":"Gang","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Jiayuan","family":"Fan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,29]]},"reference":[{"key":"4_CR1","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3674\u20133683 (2018)","DOI":"10.1109\/CVPR.2018.00387"},{"key":"4_CR3","doi-asserted-by":"crossref","unstructured":"Athanasiou, N., Petrovich, M., Black, M.J., Varol, G.: Teach: temporal action compositions for 3d humans. In: International Conference on 3D Vision (3DV) (2022)","DOI":"10.1109\/3DV57658.2022.00053"},{"key":"4_CR4","unstructured":"Bazavan, E.G., Zanfir, A., Zanfir, M., Freeman, W.T., Sukthankar, R., Sminchisescu, C.: Hspace: synthetic parametric humans animated in complex environments. arXiv preprint arXiv:2112.12867 (2021)"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Black, M.J., Patel, P., Tesch, J., Yang, J.: Bedlam: a synthetic dataset of bodies exhibiting detailed lifelike animated motion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8726\u20138737 (2023)","DOI":"10.1109\/CVPR52729.2023.00843"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Borsos, Z., et al.: Audiolm: a language modeling approach to audio generation. IEEE\/ACM Trans. Audio Speech Lang. Process. 
31, 2523\u20132533 (2023)","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: Instructpix2pix: learning to follow image editing instructions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18392\u201318402 (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"4_CR8","unstructured":"Cai, Z., et al.: Playing for 3d human recovery. arXiv preprint arXiv:2110.07588 (2021)"},{"key":"4_CR9","doi-asserted-by":"crossref","unstructured":"Cao, X., Chen, Z., Chen, A., Chen, X., Li, S., Yu, J.: Sparse photometric 3d face reconstruction guided by morphable models. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4635\u20134644 (2018)","DOI":"10.1109\/CVPR.2018.00487"},{"key":"4_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12m: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"4_CR12","doi-asserted-by":"crossref","unstructured":"Chen, S., et al.: Ll3da: visual interactive instruction tuning for omni-3d understanding reasoning and planning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26428\u201326438 (2024)","DOI":"10.1109\/CVPR52733.2024.02496"},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Choudhury, R., Kitani, K.M., Jeni, L.A.: Tempo: efficient multi-view pose estimation, tracking, and forecasting. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14750\u201314760 (2023)","DOI":"10.1109\/ICCV51070.2023.01355"},{"key":"4_CR14","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"4_CR15","unstructured":"Clavet, S.: Motion matching and the road to next-gen animation. In: Proceedings of the GDC, vol.\u00a02, p.\u00a09 (2016)"},{"key":"4_CR16","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"Gilardi, F., Alizadeh, M., Kubli, M.: Chatgpt outperforms crowd-workers for text-annotation tasks. arXiv preprint arXiv:2303.15056 (2023)","DOI":"10.1073\/pnas.2305016120"},{"key":"4_CR18","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: Imagebind: one embedding space to bind them all. arXiv preprint arXiv:2305.05665 (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"4_CR19","doi-asserted-by":"crossref","unstructured":"Goel, S., Pavlakos, G., Rajasegaran, J., Kanazawa, A., Malik, J.: Humans in 4d: reconstructing and tracking humans with transformers. 
arXiv preprint arXiv:2305.20091 (2023)","DOI":"10.1109\/ICCV51070.2023.01358"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Goutsu, Y., Inamura, T.: Linguistic descriptions of human motion with generative adversarial seq2seq learning. In: 2021 IEEE International Conference on Robotics and Automation (ICRA), pp. 4281\u20134287. IEEE (2021)","DOI":"10.1109\/ICRA48506.2021.9561519"},{"key":"4_CR21","doi-asserted-by":"crossref","unstructured":"Guler, R.A., Kokkinos, I.: Holopose: holistic 3d human reconstruction in-the-wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10884\u201310894 (2019)","DOI":"10.1109\/CVPR.2019.01114"},{"key":"4_CR22","doi-asserted-by":"crossref","unstructured":"Guo, C., Mu, Y., Javed, M.G., Wang, S., Cheng, L.: Momask: generative masked modeling of 3d human motions. arXiv preprint arXiv:2312.00063 (2023)","DOI":"10.1109\/CVPR52733.2024.00186"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3d human motions from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5152\u20135161 (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Guo, C., Zuo, X., Wang, S., Cheng, L.: Tm2t: stochastic and tokenized modeling for the reciprocal generation of 3d human motions and texts. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Action2motion: conditioned generation of 3d human motions. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 2021\u20132029 (2020)","DOI":"10.1145\/3394171.3413635"},{"key":"4_CR26","unstructured":"Guo, Z., et\u00a0al.: Point-bind and point-llm: aligning point cloud with multi-modality for 3d understanding, generation, and instruction following. arXiv preprint arXiv:2309.00615 (2023)"},{"key":"4_CR27","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4_CR28","unstructured":"Hong, Y., et al.: 3d-llm: injecting the 3d world into large language models. arXiv preprint arXiv:2307.12981 (2023)"},{"key":"4_CR29","unstructured":"Huang, S., et\u00a0al.: Language is not all you need: aligning perception with language models. arXiv preprint arXiv:2302.14045 (2023)"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Ionescu, C., Papava, D., Olaru, V., Sminchisescu, C.: Human3.6M: large scale datasets and predictive methods for 3d human sensing in natural environments. IEEE Trans. Pattern Anal. Mach. Intell. 36(7), 1325\u20131339 (2013)","DOI":"10.1109\/TPAMI.2013.248"},{"key":"4_CR31","unstructured":"Jaegle, A., Gimeno, F., Brock, A., Vinyals, O., Zisserman, A., Carreira, J.: Perceiver: general perception with iterative attention. In: International Conference on Machine Learning, pp. 4651\u20134664. PMLR (2021)"},{"key":"4_CR32","unstructured":"Jiang, B., Chen, X., Liu, W., Yu, J., Yu, G., Chen, T.: Motiongpt: human motion as a foreign language. arXiv preprint arXiv:2306.14795 (2023)"},{"key":"4_CR33","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Black, M.J., Jacobs, D.W., Malik, J.: End-to-end recovery of human shape and pose. 
In: Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00744"},{"key":"4_CR34","doi-asserted-by":"crossref","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. 36(4), 1\u201312 (2017)","DOI":"10.1145\/3072959.3073658"},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Karunratanakul, K., Preechakul, K., Suwajanakorn, S., Tang, S.: Guided motion diffusion for controllable human motion synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2151\u20132162 (2023)","DOI":"10.1109\/ICCV51070.2023.00205"},{"key":"4_CR36","doi-asserted-by":"crossref","unstructured":"Kocabas, M., Athanasiou, N., Black, M.J.: Vibe: video inference for human body pose and shape estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00530"},{"key":"4_CR37","doi-asserted-by":"crossref","unstructured":"Kocabas, M., Huang, C.H.P., Tesch, J., M\u00fcller, L., Hilliges, O., Black, M.J.: Spec: seeing people in the wild with an estimated camera. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11035\u201311045 (2021)","DOI":"10.1109\/ICCV48922.2021.01085"},{"key":"4_CR38","doi-asserted-by":"crossref","unstructured":"Kovar, L., Gleicher, M., Pighin, F.: Motion graphs. In: Seminal Graphics Papers: Pushing the Boundaries, vol. 2, pp. 723\u2013732 (2023)","DOI":"10.1145\/3596711.3596788"},{"key":"4_CR39","doi-asserted-by":"crossref","unstructured":"Kudo, T., Richardson, J.: Sentencepiece: a simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226 (2018)","DOI":"10.18653\/v1\/D18-2012"},{"key":"4_CR40","doi-asserted-by":"crossref","unstructured":"Lee, T., Moon, G., Lee, K.M.: Multiact: long-term 3d human motion generation from multiple action labels. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 1231\u20131239 (2023)","DOI":"10.1609\/aaai.v37i1.25206"},{"key":"4_CR41","doi-asserted-by":"crossref","unstructured":"Lee, Y., Wampler, K., Bernstein, G., Popovi\u0107, J., Popovi\u0107, Z.: Motion fields for interactive character locomotion. In: ACM SIGGRAPH Asia 2010 Papers, pp.\u00a01\u20138 (2010)","DOI":"10.1145\/1833349.1778859"},{"key":"4_CR42","doi-asserted-by":"crossref","unstructured":"Levine, S., Wang, J.M., Haraux, A., Popovi\u0107, Z., Koltun, V.: Continuous character control with low-dimensional embeddings. ACM Trans. Graph. 31(4), 1\u201310 (2012)","DOI":"10.1145\/2185520.2335379"},{"key":"4_CR43","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"4_CR44","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"4_CR45","unstructured":"Li, M., et al.: M3dbench: let\u2019s instruct large models with multi-modal 3d prompts. arXiv preprint arXiv:2312.10763 (2023)"},{"key":"4_CR46","doi-asserted-by":"crossref","unstructured":"Li, Y., Wu, M., Zhang, Y., Xu, L., Yu, J.: Piano: a parametric hand bone model from magnetic resonance imaging. 
arXiv preprint arXiv:2106.10893 (2021)","DOI":"10.24963\/ijcai.2021\/113"},{"key":"4_CR47","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Nimble: a non-rigid hand model with bones and muscles. ACM Trans. Graph. 41(4), 1\u201316 (2022)","DOI":"10.1145\/3528223.3530079"},{"key":"4_CR48","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"4_CR49","unstructured":"Lin, J., et al.: Motion-x: a large-scale 3d expressive whole-body human motion dataset. arXiv preprint arXiv:2307.00818 (2023)"},{"key":"4_CR50","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"4_CR51","unstructured":"Liu, X., Yan, H., Zhang, S., An, C., Qiu, X., Lin, D.: Scaling laws of rope-based extrapolation. arXiv preprint arXiv:2310.05209 (2023)"},{"key":"4_CR52","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. ACM Trans. Graph. 34(6), 248:1\u2013248:16 (2015)","DOI":"10.1145\/2816795.2818013"},{"key":"4_CR53","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"4_CR54","doi-asserted-by":"crossref","unstructured":"Lu, C., et al.: A large-scale outdoor multi-modal dataset and benchmark for novel view synthesis and implicit scene reconstruction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7557\u20137567 (2023)","DOI":"10.1109\/ICCV51070.2023.00695"},{"key":"4_CR55","unstructured":"Lu, S., et al.: Humantomato: text-aligned whole-body motion generation. arXiv preprint arXiv:2310.12978 (2023)"},{"key":"4_CR56","doi-asserted-by":"crossref","unstructured":"Ma, H., Li, J., Hosseini, R., Tomizuka, M., Choi, C.: Multi-objective diverse human motion prediction with knowledge distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8161\u20138171 (2022)","DOI":"10.1109\/CVPR52688.2022.00799"},{"key":"4_CR57","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: Amass: archive of motion capture as surface shapes. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"4_CR58","doi-asserted-by":"publisher","unstructured":"Mehta, D., et al.: Monocular 3d human pose estimation in the wild using improved CNN supervision. In: International Conference on 3D Vision (3DV) (2017). https:\/\/doi.org\/10.1109\/3dv.2017.00064. http:\/\/gvv.mpi-inf.mpg.de\/3dhp_dataset","DOI":"10.1109\/3dv.2017.00064"},{"key":"4_CR59","doi-asserted-by":"crossref","unstructured":"Min, J., Chai, J.: Motion graphs++: a compact generative model for semantic motion analysis and synthesis. ACM Trans. Graph. 31(6), 1\u201312 (2012)","DOI":"10.1145\/2366145.2366172"},{"key":"4_CR60","unstructured":"OpenAI: Gpt-4 technical report (2023)"},{"key":"4_CR61","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.: Im2text: describing images using 1 million captioned photographs. Adv. Neural Inf. Process. Syst. 24 (2011)"},{"key":"4_CR62","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. Adv. Neural. Inf. Process. Syst. 
35, 27730\u201327744 (2022)"},{"key":"4_CR63","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"4_CR64","doi-asserted-by":"crossref","unstructured":"Patel, P., Huang, C.H.P., Tesch, J., Hoffmann, D.T., Tripathi, S., Black, M.J.: Agora: avatars in geography optimized for regression analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13468\u201313478 (2021)","DOI":"10.1109\/CVPR46437.2021.01326"},{"key":"4_CR65","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3d hands, face, and body from a single image. In: Proceedings IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.01123"},{"key":"4_CR66","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Action-conditioned 3D human motion synthesis with transformer VAE. In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"4_CR67","doi-asserted-by":"publisher","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from\u00a0textual descriptions. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXII, pp. 480\u2013497. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_28","DOI":"10.1007\/978-3-031-20047-2_28"},{"key":"4_CR68","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TMR: text-to-motion retrieval using contrastive 3d human motion synthesis. arXiv preprint arXiv:2305.00976 (2023)","DOI":"10.1109\/ICCV51070.2023.00870"},{"key":"4_CR69","doi-asserted-by":"publisher","unstructured":"Plappert, M., Mandery, C., Asfour, T.: The kit motion-language dataset. Big Data 4(4), 236\u2013252 (2016). https:\/\/doi.org\/10.1089\/big.2016.0028","DOI":"10.1089\/big.2016.0028"},{"key":"4_CR70","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1016\/j.robot.2018.07.006","volume":"109","author":"M Plappert","year":"2018","unstructured":"Plappert, M., Mandery, C., Asfour, T.: Learning a bidirectional mapping between human whole-body motion and natural language using deep recurrent neural networks. Robot. Auton. Syst. 109, 13\u201326 (2018)","journal-title":"Robot. Auton. Syst."},{"key":"4_CR71","doi-asserted-by":"crossref","unstructured":"Punnakkal, A.R., Chandrasekaran, A., Athanasiou, N., Quiros-Ramirez, A., Black, M.J.: BABEL: bodies, action and behavior with English labels. In: Proceedings IEEE\/CVF Conference\u00a0on Computer Vision and Pattern Recognition (CVPR), pp. 722\u2013731 (2021)","DOI":"10.1109\/CVPR46437.2021.00078"},{"key":"4_CR72","doi-asserted-by":"crossref","unstructured":"Qiu, Z., et al.: Sculptor: skeleton-consistent face creation using a learned parametric generator. ACM Trans. Graph. 41(6), 1\u201317 (2022)","DOI":"10.1145\/3550454.3555462"},{"key":"4_CR73","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"4_CR74","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 
21(1), 5485\u20135551 (2020)"},{"key":"4_CR75","doi-asserted-by":"crossref","unstructured":"Rajasegaran, J., Pavlakos, G., Kanazawa, A., Malik, J.: Tracking people by predicting 3d appearance, location and pose. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2740\u20132749 (2022)","DOI":"10.1109\/CVPR52688.2022.00276"},{"key":"4_CR76","unstructured":"Razavi, A., Van\u00a0den Oord, A., Vinyals, O.: Generating diverse high-fidelity images with VQ-VAE-2. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"4_CR77","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022). https:\/\/github.com\/CompVis\/latent-diffusion. https:\/\/arxiv.org\/abs\/2112.10752","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"4_CR78","unstructured":"Romero, J., Tzionas, D., Black, M.J.: Embodied hands: modeling and capturing hands and bodies together. arXiv preprint arXiv:2201.02610 (2022)"},{"key":"4_CR79","doi-asserted-by":"crossref","unstructured":"Rose, C., Cohen, M.F., Bodenheimer, B.: Verbs and adverbs: multidimensional motion interpolation. IEEE Comput. Graph. Appl. 18(5), 32\u201340 (1998)","DOI":"10.1109\/38.708559"},{"key":"4_CR80","unstructured":"Rubenstein, P.K., et\u00a0al.: Audiopalm: a large language model that can speak and listen. arXiv preprint arXiv:2306.12925 (2023)"},{"key":"4_CR81","unstructured":"Schuhmann, C., et al.: Laion-5b: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)"},{"key":"4_CR82","unstructured":"Shafir, Y., Tevet, G., Kapon, R., Bermano, A.H.: Human motion diffusion as a generative prior. arXiv preprint arXiv:2303.01418 (2023)"},{"key":"4_CR83","doi-asserted-by":"crossref","unstructured":"Siddiqui, Y., et al.: Meshgpt: generating triangle meshes with decoder-only transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19615\u201319625 (2024)","DOI":"10.1109\/CVPR52733.2024.01855"},{"key":"4_CR84","doi-asserted-by":"crossref","unstructured":"Siyao, L., et al.: Bailando: 3d dance generation by actor-critic GPT with choreographic memory. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11050\u201311059 (2022)","DOI":"10.1109\/CVPR52688.2022.01077"},{"key":"4_CR85","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"4_CR86","doi-asserted-by":"crossref","unstructured":"Starke, S., Mason, I., Komura, T.: Deepphase: periodic autoencoders for learning motion phase manifolds. ACM Trans. Graph. 41(4), 1\u201313 (2022)","DOI":"10.1145\/3528223.3530178"},{"key":"4_CR87","doi-asserted-by":"crossref","unstructured":"Starke, S., Zhang, H., Komura, T., Saito, J.: Neural state machine for character-scene interactions. ACM Trans. Graph. 38(6), 209 (2019)","DOI":"10.1145\/3355089.3356505"},{"key":"4_CR88","doi-asserted-by":"crossref","unstructured":"Starke, S., Zhao, Y., Zinno, F., Komura, T.: Neural animation layering for synthesizing martial arts movements. ACM Trans. Graph. 40(4), 1\u201316 (2021)","DOI":"10.1145\/3476576.3476651"},{"key":"4_CR89","unstructured":"Szot, A., et\u00a0al.: Habitat 2.0: training home assistants to rearrange their habitat. Adv. Neural Inf. Process. Syst. 
34, 251\u2013266 (2021)"},{"key":"4_CR90","doi-asserted-by":"crossref","unstructured":"Takano, W., Nakamura, Y.: Statistical mutual conversion between whole body motion primitives and linguistic sentences for human motions. Int. J. Robot. Res. 34(10), 1314\u20131328 (2015)","DOI":"10.1177\/0278364915587923"},{"key":"4_CR91","doi-asserted-by":"crossref","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: Motionclip: exposing human motion generation to clip space. In: ECCV 2022, Part XXII, pp. 358\u2013374. Springer (2022)","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"4_CR92","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Bermano, A.H., Cohen-Or, D.: Human motion diffusion model. arXiv preprint arXiv:2209.14916 (2022)"},{"key":"4_CR93","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"4_CR94","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"4_CR95","unstructured":"Van Den\u00a0Oord, A., Vinyals, O., et\u00a0al.: Neural discrete representation learning. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"4_CR96","doi-asserted-by":"crossref","unstructured":"Varol, G., et al.: Learning from synthetic humans. In: Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.492"},{"key":"4_CR97","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"4_CR98","unstructured":"Wang, C.: T2m-hifigpt: generating high quality human motion from textual descriptions with residual discrete representations. arXiv preprint arXiv:2312.10628 (2023)"},{"key":"4_CR99","unstructured":"Wang, W., et al.: Neural marionette: a transformer-based multi-action human motion synthesis system. arXiv preprint arXiv:2209.13204 (2022)"},{"key":"4_CR100","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual chatgpt: talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671 (2023)"},{"key":"4_CR101","doi-asserted-by":"crossref","unstructured":"Wu, Y.C., Gebru, I.D., Markovi\u0107, D., Richard, A.: Audiodec: an open-source streaming high-fidelity neural audio codec. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096509"},{"key":"4_CR102","unstructured":"Xin, C., et al.: Executing your commands via motion diffusion in latent space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)"},{"key":"4_CR103","unstructured":"Xiong, W., et\u00a0al.: Effective long-context scaling of foundation models. arXiv preprint arXiv:2309.16039 (2023)"},{"key":"4_CR104","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: Videoclip: contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"4_CR105","doi-asserted-by":"crossref","unstructured":"Yamada, T., Matsunaga, H., Ogata, T.: Paired recurrent autoencoders for bidirectional translation between robot actions and linguistic descriptions. IEEE Robot. Automat. Lett. 
3(4), 3441\u20133448 (2018)","DOI":"10.1109\/LRA.2018.2852838"},{"key":"4_CR106","doi-asserted-by":"crossref","unstructured":"Yao, H., Song, Z., Zhou, Y., Ao, T., Chen, B., Liu, L.: Moconvq: unified physics-based motion control via scalable discrete representations (2023)","DOI":"10.1145\/3658137"},{"key":"4_CR107","unstructured":"Ye, Q., et\u00a0al.: mplug-owl: modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"4_CR108","unstructured":"Yin, F., et al.: Shapegpt: 3d shape generation with a unified multi-modal language model. arXiv preprint arXiv:2311.17618 (2023)"},{"key":"4_CR109","unstructured":"Yin, F., Liu, W., Huang, Z., Cheng, P., Chen, T., Yu, G.: Coordinates are not lonely-codebook prior helps implicit neural 3d representations. Adv. Neural. Inf. Process. Syst. 35, 12705\u201312717 (2022)"},{"key":"4_CR110","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789 (2022)"},{"key":"4_CR111","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Kitani, K.: Dlow: Diversifying latent flows for diverse human motion prediction. In: ECCV 2020, Part IX 16, pp. 346\u2013364. Springer (2020)","DOI":"10.1007\/978-3-030-58545-7_20"},{"key":"4_CR112","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Song, J., Iqbal, U., Vahdat, A., Kautz, J.: PhysDiff: physics-guided human motion diffusion model. In: IEEE International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"4_CR113","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-llama: an instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"4_CR114","doi-asserted-by":"crossref","unstructured":"Zhang, H., Cao, J., Lu, G., Ouyang, W., Sun, Z.: Danet: decompose-and-aggregate network for 3d human shape and pose estimation. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 935\u2013944 (2019)","DOI":"10.1145\/3343031.3351057"},{"key":"4_CR115","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: Pymaf-x: towards well-aligned full-body model regression from monocular images. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3271691"},{"key":"4_CR116","doi-asserted-by":"crossref","unstructured":"Zhang, J., et al.: T2m-gpt: generating human motion from textual descriptions with discrete representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"4_CR117","unstructured":"Zhang, M., Cai, Z., Pan, L., Hong, F., Guo, X., Yang, L., Liu, Z.: Motiondiffuse: text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)"},{"key":"4_CR118","doi-asserted-by":"crossref","unstructured":"Zhang, M., et al.: Remodiffuse: retrieval-augmented motion diffusion model. arXiv preprint arXiv:2304.01116 (2023)","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"4_CR119","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Song, J., Huang, X., Chen, Y., Liu, M.Y.: Diffcollage: parallel generation of large content with diffusion models. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10188\u201310198. 
IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.00982"},{"key":"4_CR120","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: Bertscore: evaluating text generation with Bert. arXiv preprint arXiv:1904.09675 (2019)"},{"key":"4_CR121","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Black, M.J., Tang, S.: We are more than our joints: predicting how 3d bodies move. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3372\u20133382 (2021)","DOI":"10.1109\/CVPR46437.2021.00338"},{"key":"4_CR122","unstructured":"Zheng, L., et al.: Judging LLM-as-a-judge with MT-bench and chatbot arena (2023)"},{"key":"4_CR123","unstructured":"Zhou, C., et\u00a0al.: Lima: less is more for alignment. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"4_CR124","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"4_CR125","doi-asserted-by":"crossref","unstructured":"Zou, X., et\u00a0al.: Generalized decoding for pixel, image, and language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15116\u201315127 (2023)","DOI":"10.1109\/CVPR52729.2023.01451"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73347-5_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:43:03Z","timestamp":1730108583000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73347-5_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,29]]},"ISBN":["9783031733468","9783031733475"],"references-count":125,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73347-5_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,29]]},"assertion":[{"value":"29 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference 
Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}