{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T16:43:02Z","timestamp":1779295382826,"version":"3.51.4"},"publisher-location":"Cham","reference-count":167,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726231","type":"print"},{"value":"9783031726248","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72624-8_23","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T09:52:13Z","timestamp":1729849933000},"page":"397-421","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":19,"title":["Large Motion Model for\u00a0Unified Multi-modal Motion Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8212-715X","authenticated-orcid":false,"given":"Mingyuan","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3849-2409","authenticated-orcid":false,"given":"Daisheng","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4536-5437","authenticated-orcid":false,"given":"Chenyang","family":"Gu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2412-1141","authenticated-orcid":false,"given":"Fangzhou","family":"Hong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1810-3855","authenticated-orcid":false,"given":"Zhongang","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingfang","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1378-322X","authenticated-orcid":false,"given":"Chongzhi","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3635-4278","authenticated-orcid":false,"given":"Xinying","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0571-5924","authenticated-orcid":false,"given":"Lei","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6749-4485","authenticated-orcid":false,"given":"Ying","family":"He","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4220-5958","authenticated-orcid":false,"given":"Ziwei","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"Ahn, H., Mascaro, E.V., Lee, D.: Can we use diffusion probabilistic models for 3D motion prediction? In: 2023 IEEE International Conference on Robotics and Automation (ICRA) (2023)","DOI":"10.1109\/ICRA48891.2023.10160722"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Ahn, H., Mascaro, E.V., Lee, D.: Can we use diffusion probabilistic models for 3D motion prediction? arXiv preprint arXiv:2302.14503 (2023)","DOI":"10.1109\/ICRA48891.2023.10160722"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"Ahuja, C., Morency, L.P.: Language2Pose: natural language grounded pose forecasting. In: 2019 International Conference on 3D Vision (3DV), pp. 719\u2013728. IEEE (2019)","DOI":"10.1109\/3DV.2019.00084"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Ao, T., Zhang, Z., Liu, L.: GestureDiffuCLIP: gesture diffusion model with clip latents. ACM Trans. Graph. (2023)","DOI":"10.1145\/3592097"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Athanasiou, N., Petrovich, M., Black, M.J., Varol, G.: TEACH: temporal action composition for 3D humans. In: 2022 International Conference on 3D Vision (3DV), pp. 414\u2013423. IEEE (2022)","DOI":"10.1109\/3DV57658.2022.00053"},{"key":"23_CR6","doi-asserted-by":"crossref","unstructured":"Athanasiou, N., Petrovich, M., Black, M.J., Varol, G.: SINC: spatial composition of 3d human motions for simultaneous action generation. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, 1\u20136 October 2023, pp. 9950\u20139961 (2023)","DOI":"10.1109\/ICCV51070.2023.00916"},{"key":"23_CR7","doi-asserted-by":"crossref","unstructured":"Azadi, S., Shah, A., Hayes, T., Parikh, D., Gupta, S.: Make-an-animation: large-scale text-conditional 3d human motion generation. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, 1\u20136 October 2023, pp. 14993\u201315002 (2023)","DOI":"10.1109\/ICCV51070.2023.01381"},{"key":"23_CR8","doi-asserted-by":"crossref","unstructured":"Barquero, G., Escalera, S., Palmero, C.: BeLFusion: latent diffusion for behavior-driven human motion prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2317\u20132327 (2023)","DOI":"10.1109\/ICCV51070.2023.00220"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Barsoum, E., Kender, J., Liu, Z.: HP-GAN: probabilistic 3D human motion prediction via GAN. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 1418\u20131427 (2018)","DOI":"10.1109\/CVPRW.2018.00191"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: InstructPix2Pix: learning to follow image editing instructions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18392\u201318402 (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"23_CR11","unstructured":"Cai, Z., et\u00a0al.: Digital life project: autonomous 3D characters with social intelligence. arXiv preprint arXiv:2312.04547 (2023)"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"Castillo, A., et al.: BoDiffusion: diffusing sparse observations for full-body human motion synthesis (2023)","DOI":"10.1109\/ICCVW60793.2023.00456"},{"key":"23_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"356","DOI":"10.1007\/978-3-031-19790-1_22","volume-title":"Computer Vision \u2013 ECCV 2022","author":"P Cervantes","year":"2022","unstructured":"Cervantes, P., Sekikawa, Y., Sato, I., Shinoda, K.: Implicit neural representations for variable length human motion generation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13677, pp. 356\u2013372. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19790-1_22"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Chen, L.H., Zhang, J., Li, Y., Pang, Y., Xia, X., Liu, T.: HumanMAC: masked motion completion for human motion prediction (2023)","DOI":"10.1109\/ICCV51070.2023.00875"},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: Executing your commands via motion diffusion in latent space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18000\u201318010 (2023)","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Chopin, B., Tang, H., Daoudi, M.: Bipartite graph diffusion model for human interaction generation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 5333\u20135342 (2024)","DOI":"10.1109\/WACV57701.2024.00525"},{"key":"23_CR17","unstructured":"Couairon, G., Verbeek, J., Schwenk, H., Cord, M.: DiffEdit: diffusion-based semantic image editing with mask guidance. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"23_CR18","doi-asserted-by":"crossref","unstructured":"Dabral, R., Mughal, M.H., Golyanik, V., Theobalt, C.: MoFusion: a framework for denoising-diffusion-based motion synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9760\u20139770 (2023)","DOI":"10.1109\/CVPR52729.2023.00941"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Diller, C., Dai, A.: CG-HOI: contact-guided 3D human-object interaction generation. arXiv preprint arXiv:2311.16097 (2023)","DOI":"10.1109\/CVPR52733.2024.01880"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Diller, C., Funkhouser, T., Dai, A.: Forecasting characteristic 3D poses of human actions (2022)","DOI":"10.1109\/CVPR52688.2022.01545"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Du, Y., Kips, R., Pumarola, A., Starke, S., Thabet, A., Sanakoyeu, A.: Avatars grow legs: generating smooth human motion from sparse tracking inputs with diffusion model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 481\u2013490 (2023)","DOI":"10.1109\/CVPR52729.2023.00054"},{"key":"23_CR22","unstructured":"Gao, X., Hu, L., Zhang, P., Zhang, B., Bo, L.: DanceMeld: unraveling dance phrases with hierarchical latent codes for music-to-dance synthesis. arXiv preprint arXiv:2401.10242 (2023)"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Ghorbani, S., Ferstl, Y., Holden, D., Troje, N.F., Carbonneau, M.A.: ZeroEGGS: zero-shot example-based gesture generation from speech. In: Computer Graphics Forum, vol.\u00a042, pp. 206\u2013216. Wiley Online Library (2023)","DOI":"10.1111\/cgf.14734"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Cheema, N., Oguz, C., Theobalt, C., Slusallek, P.: Synthesis of compositional animations from textual descriptions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1396\u20131406 (2021)","DOI":"10.1109\/ICCV48922.2021.00143"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Cheema, N., Oguz, C., Theobalt, C., Slusallek, P.: Text-based motion synthesis with a hierarchical two-stream RNN. In: ACM SIGGRAPH 2021 Posters, pp.\u00a01\u20132 (2021)","DOI":"10.1145\/3450618.3469163"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Dabral, R., Golyanik, V., Theobalt, C., Slusallek, P.: ReMoS: reactive 3D motion synthesis for two-person interactions. arXiv preprint arXiv:2311.17057 (2023)","DOI":"10.1007\/978-3-031-72764-1_24"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: ImageBind: one embedding space to bind them all. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15180\u201315190 (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Goel, P., Wang, K.C., Liu, C.K., Fatahalian, K.: Iterative motion editing with natural language. arXiv preprint arXiv:2312.11538 (2023)","DOI":"10.1145\/3641519.3657447"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Gong, K., et al.: TM2D: bimodality driven 3D dance generation via music-text integration. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9942\u20139952 (2023)","DOI":"10.1109\/ICCV51070.2023.00912"},{"key":"23_CR30","doi-asserted-by":"crossref","unstructured":"Gopalakrishnan, A., Mali, A., Kifer, D., Giles, L., Ororbia, A.G.: A neural temporal model for human motion prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12116\u201312125 (2019)","DOI":"10.1109\/CVPR.2019.01239"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Guo, C., Mu, Y., Javed, M.G., Wang, S., Cheng, L.: MoMask: generative masked modeling of 3D human motions. arXiv preprint arXiv:2312.00063 (2023)","DOI":"10.1109\/CVPR52733.2024.00186"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3D human motions from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5152\u20135161 (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"23_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"580","DOI":"10.1007\/978-3-031-19833-5_34","volume-title":"Computer Vision \u2013 ECCV 2022","author":"C Guo","year":"2022","unstructured":"Guo, C., Zuo, X., Wang, S., Cheng, L.: TM2T: stochastic and tokenized modeling for the reciprocal generation of 3D human motions and texts. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13695, pp. 580\u2013597. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_34"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Action2Motion: conditioned generation of 3D human motions. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 2021\u20132029 (2020)","DOI":"10.1145\/3394171.3413635"},{"key":"23_CR35","doi-asserted-by":"crossref","unstructured":"Guo, W., Du, Y., Shen, X., Lepetit, V., Alameda-Pineda, X., Moreno-Noguer, F.: Back to MLP: a simple baseline for human motion prediction. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 4809\u20134819 (2023)","DOI":"10.1109\/WACV56688.2023.00479"},{"key":"23_CR36","unstructured":"Han, B., et al.: AMD autoregressive motion diffusion. arXiv preprint arXiv:2305.09381 (2023)"},{"key":"23_CR37","doi-asserted-by":"crossref","unstructured":"Hao, Y., Zhang, J., Zhuo, T., Wen, F., Fan, H.: Hand-centric motion refinement for 3D hand-object interaction via hierarchical spatial-temporal modeling. arXiv preprint arXiv:2401.15987 (2024)","DOI":"10.1609\/aaai.v38i3.27979"},{"key":"23_CR38","unstructured":"He, X., Huang, S., Zhan, X., Wen, C., Shan, Y.: SemanticBoost: elevating motion generation with augmented textual cues. arXiv preprint arXiv:2310.20323 (2023)"},{"key":"23_CR39","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)"},{"key":"23_CR40","doi-asserted-by":"crossref","unstructured":"Hoang, N.M., Gong, K., Guo, C., Mi, M.B.: MotionMix: weakly-supervised diffusion for controllable motion generation. arXiv preprint arXiv:2401.11115 (2024)","DOI":"10.1609\/aaai.v38i3.27988"},{"issue":"4","key":"23_CR41","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530094","volume":"41","author":"F Hong","year":"2022","unstructured":"Hong, F., Zhang, M., Pan, L., Cai, Z., Yang, L., Liu, Z.: AvatarCLIP: zero-shot text-driven generation and animation of 3D avatars. ACM Trans. Graph. (TOG) 41(4), 1\u201319 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR42","unstructured":"Hu, V.T., et al.: Motion flow matching for human motion synthesis and editing. arXiv preprint arXiv:2312.08895 (2023)"},{"key":"23_CR43","unstructured":"Huang, R., Hu, H., Wu, W., Sawada, K., Zhang, M., Jiang, D.: Dance revolution: long-term dance generation with music via curriculum learning. arXiv preprint arXiv:2006.06119 (2020)"},{"key":"23_CR44","doi-asserted-by":"crossref","unstructured":"Huang, S., Wang, Z., Li, P., Jia, B., Liu, T., Zhu, Y., Liang, W., Zhu, S.C.: Diffusion-based generation, optimization, and planning in 3D scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16750\u201316761 (2023)","DOI":"10.1109\/CVPR52729.2023.01607"},{"issue":"7","key":"23_CR45","doi-asserted-by":"publisher","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","volume":"36","author":"C Ionescu","year":"2013","unstructured":"Ionescu, C., Papava, D., Olaru, V., Sminchisescu, C.: Human3.6M: large scale datasets and predictive methods for 3d human sensing in natural environments. IEEE Trans. Pattern Anal. Mach. Intell. 36(7), 1325\u20131339 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"23_CR46","doi-asserted-by":"crossref","unstructured":"Ji, Y., Xu, F., Yang, Y., Shen, F., Shen, H.T., Zheng, W.S.: A large-scale RGB-D database for arbitrary-view human action recognition. In: Proceedings of the 26th ACM International Conference on Multimedia, pp. 1510\u20131518 (2018)","DOI":"10.1145\/3240508.3240675"},{"key":"23_CR47","unstructured":"Jiang, B., Chen, X., Liu, W., Yu, J., Yu, G., Chen, T.: MotionGPT: human motion as a foreign language. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR48","doi-asserted-by":"crossref","unstructured":"Jiang, C., et\u00a0al.: MotionDiffuser: controllable multi-agent motion prediction using diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9644\u20139653 (2023)","DOI":"10.1109\/CVPR52729.2023.00930"},{"key":"23_CR49","unstructured":"Jin, P., Wu, Y., Fan, Y., Sun, Z., Wei, Y., Yuan, L.: Act as you wish: fine-grained control of motion diffusion model with hierarchical semantic graphs. In: NeurIPS (2023)"},{"key":"23_CR50","doi-asserted-by":"crossref","unstructured":"Jing, B., Zhang, Y., Song, Z., Yu, J., Yang, W.: AMD: anatomical motion diffusion with interpretable motion decomposition and fusion. arXiv preprint arXiv:2312.12763 (2023)","DOI":"10.1609\/aaai.v38i3.28042"},{"key":"23_CR51","doi-asserted-by":"crossref","unstructured":"Kalakonda, S.S., Maheshwari, S., Sarvadevabhatla, R.K.: Action-GPT: leveraging large-scale language models for improved and generalized action generation. In: 2023 IEEE International Conference on Multimedia and Expo (ICME), pp. 31\u201336. IEEE (2023)","DOI":"10.1109\/ICME55011.2023.00014"},{"key":"23_CR52","doi-asserted-by":"crossref","unstructured":"Karunratanakul, K., Preechakul, K., Suwajanakorn, S., Tang, S.: Guided motion diffusion for controllable human motion synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2151\u20132162 (2023)","DOI":"10.1109\/ICCV51070.2023.00205"},{"key":"23_CR53","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: Conference on Computer Vision and Pattern Recognition 2023 (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"23_CR54","doi-asserted-by":"crossref","unstructured":"Kim, G., Shim, H., Kim, H., Choi, Y., Kim, J., Yang, E.: Diffusion video autoencoders: toward temporally consistent face video editing via disentangled video encoding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6091\u20136100 (2023)","DOI":"10.1109\/CVPR52729.2023.00590"},{"key":"23_CR55","doi-asserted-by":"crossref","unstructured":"Kim, J., Kim, J., Choi, S.: FLAME: free-form language-based motion synthesis & editing. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 8255\u20138263 (2023)","DOI":"10.1609\/aaai.v37i7.25996"},{"key":"23_CR56","doi-asserted-by":"crossref","unstructured":"Kong, H., Gong, K., Lian, D., Mi, M.B., Wang, X.: Priority-centric human motion generation in discrete latent space. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14806\u201314816 (2023)","DOI":"10.1109\/ICCV51070.2023.01360"},{"key":"23_CR57","doi-asserted-by":"crossref","unstructured":"Kucherenko, T., Hasegawa, D., Henter, G.E., Kaneko, N., Kjellstr\u00f6m, H.: Analyzing input and output representations for speech-driven gesture generation. In: Proceedings of the 19th ACM International Conference on Intelligent Virtual Agents, pp. 97\u2013104 (2019)","DOI":"10.1145\/3308532.3329472"},{"key":"23_CR58","doi-asserted-by":"publisher","DOI":"10.1080\/10447318.2021.1883883","author":"T Kucherenko","year":"2021","unstructured":"Kucherenko, T., Hasegawa, D., Kaneko, N., Henter, G.E., Kjellstr\u00f6m, H.: Moving fast and slow: analysis of representations and post-processing in speech-driven automatic gesture generation. Int. J. Hum.-Comput. Interact. (2021). https:\/\/doi.org\/10.1080\/10447318.2021.1883883","journal-title":"Int. J. Hum.-Comput. Interact."},{"key":"23_CR59","doi-asserted-by":"crossref","unstructured":"Kulal, S., Mao, J., Aiken, A., Wu, J.: Programmatic concept learning for human motion description and synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13843\u201313852 (2022)","DOI":"10.1109\/CVPR52688.2022.01347"},{"key":"23_CR60","doi-asserted-by":"crossref","unstructured":"Kulkarni, N., et al.: NIFTY: neural object interaction fields for guided human motion synthesis. arXiv preprint arXiv:2307.07511 (2023)","DOI":"10.1109\/CVPR52733.2024.00096"},{"key":"23_CR61","unstructured":"Lee, H.Y., et al.: Dancing to music. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"23_CR62","doi-asserted-by":"crossref","unstructured":"Li, B., Zhao, Y., Shi, Z., Sheng, L.: DanceFormer: music conditioned 3D dance generation with parametric motion transformer. In: AAAI (2022)","DOI":"10.1609\/aaai.v36i2.20014"},{"key":"23_CR63","doi-asserted-by":"crossref","unstructured":"Li, J., Clegg, A., Mottaghi, R., Wu, J., Puig, X., Liu, C.K.: Controllable human-object interaction synthesis. arXiv preprint arXiv:2312.03913 (2023)","DOI":"10.1007\/978-3-031-72940-9_4"},{"issue":"6","key":"23_CR64","first-page":"1","volume":"42","author":"J Li","year":"2023","unstructured":"Li, J., Wu, J., Liu, C.K.: Object motion guided human motion synthesis. ACM Trans. Graph. (TOG) 42(6), 1\u201311 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR65","unstructured":"Li, J., et al.: Learning to generate diverse dance motions with transformer. arXiv preprint arXiv:2008.08171 (2020)"},{"key":"23_CR66","doi-asserted-by":"crossref","unstructured":"Li, R., Yang, S., Ross, D.A., Kanazawa, A.: AI choreographer: music conditioned 3D dance generation with AIST++. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13401\u201313412 (2021)","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"23_CR67","doi-asserted-by":"crossref","unstructured":"Li, S., Zhuang, S., Song, W., Zhang, X., Chen, H., Hao, A.: Sequential texts driven cohesive motions synthesis with natural transitions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9498\u20139508 (2023)","DOI":"10.1109\/ICCV51070.2023.00871"},{"key":"23_CR68","unstructured":"Li, S., Singh, H., Grover, A.: Instructany2Pix: flexible visual editing via multimodal instruction following. arXiv preprint arXiv:2312.06738 (2023)"},{"key":"23_CR69","doi-asserted-by":"crossref","unstructured":"Li, T., Bolkart, T., Black, M.J., Li, H., Romero, J.: Learning a model of facial shape and expression from 4D scans. ACM Trans. Graph. 36(6), 194-1 (2017)","DOI":"10.1145\/3130800.3130813"},{"key":"23_CR70","doi-asserted-by":"crossref","unstructured":"Li, W., Xu, X., Liu, J., Xiao, X.: UNIMO-G: Unified image generation through multimodal conditional diffusion. arXiv preprint arXiv:2401.13388 (2024)","DOI":"10.18653\/v1\/2024.acl-long.335"},{"key":"23_CR71","doi-asserted-by":"crossref","unstructured":"Liang, H., Zhang, W., Li, W., Yu, J., Xu, L.: InterGen: diffusion-based multi-human motion generation under complex interactions. arXiv preprint arXiv:2304.05684 (2023)","DOI":"10.1007\/s11263-024-02042-6"},{"key":"23_CR72","unstructured":"Liang, Z., Li, Z., Zhou, S., Li, C., Loy, C.C.: Control color: multimodal diffusion-based interactive image colorization. arXiv preprint arXiv:2402.10855 (2024)"},{"key":"23_CR73","doi-asserted-by":"crossref","unstructured":"Lim, D., Jeong, C., Kim, Y.M.: MAMMOS: mapping multiple human motion with scene understanding and natural interactions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4278\u20134287 (2023)","DOI":"10.1109\/ICCVW60793.2023.00462"},{"key":"23_CR74","unstructured":"Lin, A.S., Wu, L., Corona, R., Tai, K., Huang, Q., Mooney, R.J.: Generating animated videos of human activities from natural language descriptions. In: NeurIPS Workshop (2018)"},{"key":"23_CR75","unstructured":"Lin, J., et al.: Motion-X: a large-scale 3D expressive whole-body human motion dataset. In: Advances in Neural Information Processing Systems (2023)"},{"key":"23_CR76","unstructured":"Lin, J., et al.: OHMG: zero-shot open-vocabulary human motion generation. arXiv preprint arXiv:2210.15929 (2022)"},{"key":"23_CR77","doi-asserted-by":"crossref","unstructured":"Lin, J., et al.: Being comes from not-being: Open-vocabulary text-to-motion generation with wordless training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23222\u201323231 (2023)","DOI":"10.1109\/CVPR52729.2023.02224"},{"key":"23_CR78","unstructured":"Lin, P., et al.: HandDiffuse: generative controllers for two-hand interactions via diffusion models. arXiv preprint arXiv:2312.04867 (2023)"},{"key":"23_CR79","doi-asserted-by":"crossref","unstructured":"Ling, Z., Han, B., Wong, Y., Kangkanhalli, M., Geng, W.: MCM: multi-condition motion synthesis framework for multi-scenario. arXiv preprint arXiv:2309.03031 (2023)","DOI":"10.24963\/ijcai.2024\/120"},{"key":"23_CR80","unstructured":"Liu, C., Zhao, M., Ren, B., Liu, M., Sebe, N., et\u00a0al.: Spatio-temporal graph diffusion for text-driven human motion generation. In: British Machine Vision Conference (2023)"},{"key":"23_CR81","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"612","DOI":"10.1007\/978-3-031-20071-7_36","volume-title":"Computer Vision \u2013 ECCV 2022","author":"H Liu","year":"2022","unstructured":"Liu, H., et al.: BEAT: a large-scale semantic and emotional multi-modal dataset for conversational gestures synthesis. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13667, pp. 612\u2013630. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20071-7_36"},{"key":"23_CR82","doi-asserted-by":"crossref","unstructured":"Liu, J., Dai, W., Wang, C., Cheng, Y., Tang, Y., Tong, X.: Plan, posture and go: towards open-world text-to-motion generation. arXiv preprint arXiv:2312.14828 (2023)","DOI":"10.1007\/978-3-031-73383-3_26"},{"issue":"10","key":"23_CR83","doi-asserted-by":"publisher","first-page":"2684","DOI":"10.1109\/TPAMI.2019.2916873","volume":"42","author":"J Liu","year":"2019","unstructured":"Liu, J., Shahroudy, A., Perez, M., Wang, G., Duan, L.Y., Kot, A.C.: NTU RGB+ D 120: a large-scale benchmark for 3D human activity understanding. IEEE Trans. Pattern Anal. Mach. Intell. 42(10), 2684\u20132701 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"23_CR84","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: Learning hierarchical cross-modal association for co-speech gesture generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10462\u201310472 (2022)","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"23_CR85","unstructured":"Liu, X., Chen, G., Tang, Y., Wang, G., Lim, S.N.: Language-free compositional action generation via decoupling refinement. arXiv preprint arXiv:2307.03538 (2023)"},{"key":"23_CR86","doi-asserted-by":"crossref","unstructured":"Liu, X., Hou, H., Yang, Y., Li, Y.L., Lu, C.: Revisit human-scene interaction via space occupancy. arXiv preprint arXiv:2312.02700 (2023)","DOI":"10.1007\/978-3-031-72973-7_1"},{"key":"23_CR87","unstructured":"Liu, Y., Chen, C., Yi, L.: Interactive humanoid: online full-body motion reaction synthesis with social affordance canonicalization and forecasting. arXiv preprint arXiv:2312.08983 (2023)"},{"issue":"6","key":"23_CR88","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2816795.2818013","volume":"34","author":"M Loper","year":"2015","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. ACM trans. graph. (TOG) 34(6), 1\u201316 (2015)","journal-title":"ACM trans. graph. (TOG)"},{"key":"23_CR89","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. ACM Trans. Graph. (Proc. SIGGRAPH Asia) 34(6), 248:1\u2013248:16 (2015)","DOI":"10.1145\/2816795.2818013"},{"key":"23_CR90","unstructured":"Lou, Y., Zhu, L., Wang, Y., Wang, X., Yang, Y.: DiverseMotion: towards diverse human motion generation via discrete diffusion. arXiv preprint arXiv:2309.01372 (2023)"},{"key":"23_CR91","unstructured":"Lu, S., et al.: HumanTOMATO: text-aligned whole-body motion generation. arXiv preprint arXiv:2310.12978 (2023)"},{"key":"23_CR92","unstructured":"Ma, J., Bai, S., Zhou, C.: Pretrained diffusion models for unified human motion synthesis. arXiv preprint arXiv:2212.02837 (2022)"},{"key":"23_CR93","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: AMASS: archive of motion capture as surface shapes. In: International Conference on Computer Vision, pp. 5442\u20135451 (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"23_CR94","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"474","DOI":"10.1007\/978-3-030-58568-6_28","volume-title":"Computer Vision \u2013 ECCV 2020","author":"W Mao","year":"2020","unstructured":"Mao, W., Liu, M., Salzmann, M.: History repeats itself: human motion prediction via motion attention. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020, Part XIV. LNCS, vol. 12359, pp. 474\u2013489. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_28"},{"key":"23_CR95","doi-asserted-by":"crossref","unstructured":"Mao, W., Liu, M., Salzmann, M., Li, H.: Learning trajectory dependencies for human motion prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9489\u20139497 (2019)","DOI":"10.1109\/ICCV.2019.00958"},{"key":"23_CR96","doi-asserted-by":"crossref","unstructured":"von Marcard, T., Henschel, R., Black, M., Rosenhahn, B., Pons-Moll, G.: Recovering accurate 3D human pose in the wild using IMUs and a moving camera. In: European Conference on Computer Vision (ECCV) (sep 2018)","DOI":"10.1007\/978-3-030-01249-6_37"},{"key":"23_CR97","doi-asserted-by":"publisher","unstructured":"Mehta, D., et al.: Monocular 3D human pose estimation in the wild using improved cnn supervision. In: 3D Vision (3DV), 2017 Fifth International Conference on. IEEE (2017). https:\/\/doi.org\/10.1109\/3dv.2017.00064, http:\/\/gvv.mpi-inf.mpg.de\/3dhp_dataset","DOI":"10.1109\/3dv.2017.00064"},{"key":"23_CR98","unstructured":"Nguyen, T., Li, Y., Ojha, U., Lee, Y.J.: Visual instruction inversion: image editing via visual prompting. In: Thirty-seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=l9BsCh8ikK"},{"key":"23_CR99","unstructured":"Okamura, M., Kondo, N., Sakamoto, T.F.M., Ochiai, Y.: Dance generation by sound symbolic words. arXiv preprint arXiv:2306.03646 (2023)"},{"key":"23_CR100","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3D hands, face, and body from a single image. In: Proceedings IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.01123"},{"key":"23_CR101","unstructured":"Peng, X., Xie, Y., Wu, Z., Jampani, V., Sun, D., Jiang, H.: HOI-Diff: text-driven synthesis of 3D human-object interactions using diffusion models. arXiv preprint arXiv:2312.06553 (2023)"},{"key":"23_CR102","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Action-conditioned 3D human motion synthesis with transformer VAE. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10985\u201310995 (2021)","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"23_CR103","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"480","DOI":"10.1007\/978-3-031-20047-2_28","volume-title":"Computer Vision \u2013 ECCV 2022","author":"M Petrovich","year":"2022","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from textual descriptions. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13682, pp. 480\u2013497. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_28"},{"key":"23_CR104","doi-asserted-by":"crossref","unstructured":"Petrovich, M., et al.: Multi-track timeline control for text-driven 3D human motion generation. arXiv preprint arXiv:2401.08559 (2024)","DOI":"10.1109\/CVPRW63382.2024.00197"},{"key":"23_CR105","doi-asserted-by":"crossref","unstructured":"Pi, H., Peng, S., Yang, M., Zhou, X., Bao, H.: Hierarchical generation of human-object interactions with diffusion probabilistic models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15061\u201315073 (2023)","DOI":"10.1109\/ICCV51070.2023.01383"},{"key":"23_CR106","doi-asserted-by":"crossref","unstructured":"Pinyoanuntapong, E., Wang, P., Lee, M., Chen, C.: MMM: generative masked motion model. arXiv preprint arXiv:2312.03596 (2023)","DOI":"10.1109\/CVPR52733.2024.00153"},{"issue":"4","key":"23_CR107","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1089\/big.2016.0028","volume":"4","author":"M Plappert","year":"2016","unstructured":"Plappert, M., Mandery, C., Asfour, T.: The kit motion-language dataset. Big Data 4(4), 236\u2013252 (2016)","journal-title":"Big Data"},{"key":"23_CR108","doi-asserted-by":"crossref","unstructured":"Punnakkal, A.R., Chandrasekaran, A., Athanasiou, N., Quiros-Ramirez, A., Black, M.J.: BABEL: bodies, action and behavior with English labels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 722\u2013731 (2021)","DOI":"10.1109\/CVPR46437.2021.00078"},{"key":"23_CR109","doi-asserted-by":"crossref","unstructured":"Qi, Q., et al.: DiffDance: cascaded human motion diffusion model for dance generation. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 1374\u20131382 (2023)","DOI":"10.1145\/3581783.3612307"},{"key":"23_CR110","doi-asserted-by":"crossref","unstructured":"Qian, Y., Urbanek, J., Hauptmann, A.G., Won, J.: Breaking the limits of text-conditioned 3D motion synthesis with elaborative descriptions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2306\u20132316 (2023)","DOI":"10.1109\/ICCV51070.2023.00219"},{"key":"23_CR111","doi-asserted-by":"crossref","unstructured":"Qing, Z., Cai, Z., Yang, Z., Yang, L.: Story-to-motion: synthesizing infinite and controllable character animation from long text. In: SIGGRAPH Asia 2023 Technical Communications, SA Technical Communications 2023, Sydney, NSW, Australia, 12\u201315 December 2023, pp. 28:1\u201328:4 (2023)","DOI":"10.1145\/3610543.3626176"},{"key":"23_CR112","unstructured":"Raab, S., Leibovitch, I., Tevet, G., Arar, M., Bermano, A.H., Cohen-Or, D.: Single motion diffusion. arXiv preprint arXiv:2302.05905 (2023)"},{"key":"23_CR113","unstructured":"Ren, J., Zhang, M., Yu, C., Ma, X., Pan, L., Liu, Z.: InsActor: instruction-driven physics-based characters. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR114","doi-asserted-by":"crossref","unstructured":"Ribeiro-Gomes, J., et al.: MotionGPT: human motion synthesis with improved diversity and realism via GPT-3 prompting. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 5070\u20135080 (2024)","DOI":"10.1109\/WACV57701.2024.00499"},{"key":"23_CR115","doi-asserted-by":"crossref","unstructured":"Ruan, L., et al.: MM-diffusion: learning multi-modal diffusion models for joint audio and video generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10219\u201310228 (2023)","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"23_CR116","unstructured":"Shafir, Y., Tevet, G., Kapon, R., Bermano, A.H.: Human motion diffusion as a generative prior. arXiv preprint arXiv:2303.01418 (2023)"},{"key":"23_CR117","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.T., Wang, G.: NTU RGB+ D: a large scale dataset for 3D human activity analysis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1010\u20131019 (2016)","DOI":"10.1109\/CVPR.2016.115"},{"key":"23_CR118","unstructured":"Shi, X., Luo, C., Peng, J., Zhang, H., Sun, Y.: Generating fine-grained human motions using ChatGPT-refined descriptions. arXiv preprint arXiv:2312.02772 (2023)"},{"key":"23_CR119","doi-asserted-by":"crossref","unstructured":"Shimada, S., et al.: MACS: mass conditioned 3D hand and object motion synthesis. arXiv preprint arXiv:2312.14929 (2023)","DOI":"10.1109\/3DV62453.2024.00082"},{"key":"23_CR120","unstructured":"Siyao, L., et al.: Duolando: follower GPT with off-policy reinforcement learning for dance accompaniment. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"23_CR121","doi-asserted-by":"crossref","unstructured":"Siyao, L., et al.: Bailando: 3D dance generation by actor-critic GPT with choreographic memory. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11050\u201311059 (2022)","DOI":"10.1109\/CVPR52688.2022.01077"},{"key":"23_CR122","doi-asserted-by":"publisher","first-page":"497","DOI":"10.1109\/TMM.2020.2981989","volume":"23","author":"G Sun","year":"2020","unstructured":"Sun, G., Wong, Y., Cheng, Z., Kankanhalli, M.S., Geng, W., Li, X.: DeepDance: music-to-dance motion choreography with adversarial learning. IEEE Trans. Multimedia 23, 497\u2013509 (2020)","journal-title":"IEEE Trans. Multimedia"},{"key":"23_CR123","unstructured":"Sun, J., Lin, Z., Han, X., Hu, J.F., Xu, J., Zheng, W.S.: Action-guided 3D human motion prediction. In: Advances in Neural Information Processing Systems, vol. 34, pp. 30169\u201330180 (2021)"},{"key":"23_CR124","doi-asserted-by":"crossref","unstructured":"Sun, J., Chowdhary, G.: Towards globally consistent stochastic human motion prediction via motion diffusion. arXiv preprint arXiv:2305.12554 (2023)","DOI":"10.1007\/978-3-031-73036-8_2"},{"key":"23_CR125","doi-asserted-by":"crossref","unstructured":"Tanaka, M., Fujiwara, K.: Role-aware interaction generation from textual description. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15999\u201316009 (2023)","DOI":"10.1109\/ICCV51070.2023.01466"},{"key":"23_CR126","doi-asserted-by":"crossref","unstructured":"Tendulkar, P., Sur\u00eds, D., Vondrick, C.: FLEX: full-body grasping without full-body grasps. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21179\u201321189 (2023)","DOI":"10.1109\/CVPR52729.2023.02029"},{"key":"23_CR127","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"358","DOI":"10.1007\/978-3-031-20047-2_21","volume-title":"Computer Vision \u2013 ECCV 2022","author":"G Tevet","year":"2022","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: MotionCLIP: exposing human motion generation to clip space. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13682, pp. 358\u2013374. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_21"},{"key":"23_CR128","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-or, D., Bermano, A.H.: Human motion diffusion model. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"23_CR129","doi-asserted-by":"crossref","unstructured":"Tseng, J., Castellon, R., Liu, K.: EDGE: editable dance generation from music. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 448\u2013458 (2023)","DOI":"10.1109\/CVPR52729.2023.00051"},{"key":"23_CR130","doi-asserted-by":"crossref","unstructured":"Voas, J., Wang, Y., Huang, Q., Mooney, R.: What is the best automated metric for text to motion generation? In: SIGGRAPH Asia 2023 Conference Papers, pp. 1\u201311 (2023)","DOI":"10.1145\/3610548.3618185"},{"key":"23_CR131","unstructured":"Voleti, V., Jolicoeur-Martineau, A., Pal, C.: MCVD-masked conditional video diffusion for prediction, generation, and interpolation. In: Advances in Neural Information Processing Systems, vol. 35, pp. 23371\u201323385 (2022)"},{"key":"23_CR132","unstructured":"Wan, W., Dou, Z., Komura, T., Wang, W., Jayaraman, D., Liu, L.: TLControl: trajectory and language control for human motion synthesis. arXiv preprint arXiv:2311.17135 (2023)"},{"key":"23_CR133","doi-asserted-by":"crossref","unstructured":"Wang, X., Cui, Q., Chen, C., Liu, M.: GCNext: towards the unity of graph convolutions for human motion prediction. arXiv preprint arXiv:2312.11850 (2023)","DOI":"10.1609\/aaai.v38i6.28375"},{"key":"23_CR134","doi-asserted-by":"crossref","unstructured":"Wang, Y., Leng, Z., Li, F.W., Wu, S.C., Liang, X.: Fg-T2M: fine-grained text-driven human motion generation via diffusion model. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 22035\u201322044 (2023)","DOI":"10.1109\/ICCV51070.2023.02014"},{"key":"23_CR135","unstructured":"Wang, Y., Lin, J., Zeng, A., Luo, Z., Zhang, J., Zhang, L.: PhysHOI: physics-based imitation of dynamic human-object interaction. arXiv preprint arXiv:2312.04393 (2023)"},{"key":"23_CR136","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Learning diverse stochastic human-action generators by learning smooth latent transitions. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 12281\u201312288 (2020)","DOI":"10.1609\/aaai.v34i07.6911"},{"key":"23_CR137","doi-asserted-by":"crossref","unstructured":"Wei, D., et al.: Enhanced fine-grained motion diffusion for text-driven human motion synthesis (2023)","DOI":"10.1609\/aaai.v38i6.28401"},{"key":"23_CR138","unstructured":"Xiao, Z., et al.: Unified human-scene interaction via prompted chain-of-contacts. arXiv preprint arXiv:2309.07918 (2023)"},{"key":"23_CR139","unstructured":"Xie, Y., Jampani, V., Zhong, L., Sun, D., Jiang, H.: OmniControl: control any joint at any time for human motion generation. arXiv preprint arXiv:2310.08580 (2023)"},{"key":"23_CR140","doi-asserted-by":"crossref","unstructured":"Xie, Z., Wu, Y., Gao, X., Sun, Z., Yang, W., Liang, X.: Towards detailed text-to-motion synthesis via basic-to-advanced hierarchical diffusion model. arXiv preprint arXiv:2312.10960 (2023)","DOI":"10.1609\/aaai.v38i6.28443"},{"key":"23_CR141","doi-asserted-by":"crossref","unstructured":"Xu, Z., Zhang, Y., Yang, S., Li, R., Li, X.: Chain of generation: multi-modal gesture synthesis via cascaded conditional control. arXiv preprint arXiv:2312.15900 (2023)","DOI":"10.1609\/aaai.v38i6.28458"},{"key":"23_CR142","unstructured":"Yan, H., Hu, Z., Schmitt, S., Bulling, A.: GazeMoDiff: gaze-guided diffusion model for stochastic human motion prediction. arXiv preprint arXiv:2312.12090 (2023)"},{"key":"23_CR143","doi-asserted-by":"crossref","unstructured":"Yang, S., Zhou, Y., Liu, Z., , Loy, C.C.: Rerender a video: zero-shot text-guided video-to-video translation. In: ACM SIGGRAPH Asia 2023 Conference Proceedings (2023)","DOI":"10.1145\/3610548.3618160"},{"key":"23_CR144","unstructured":"Yang, S., Yang, Z., Wang, Z.: LongDanceDiff: long-term dance generation with conditional diffusion model. arXiv preprint arXiv:2308.11945 (2023)"},{"key":"23_CR145","doi-asserted-by":"crossref","unstructured":"Yang, Z., Su, B., Wen, J.R.: Synthesizing long-term human motions with diffusion models via coherent sampling. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 3954\u20133964 (2023)","DOI":"10.1145\/3581783.3611887"},{"key":"23_CR146","doi-asserted-by":"crossref","unstructured":"Yao, H., Song, Z., Zhou, Y., Ao, T., Chen, B., Liu, L.: MoConVQ: unified physics-based motion control via scalable discrete representations. arXiv preprint arXiv:2310.10198 (2023)","DOI":"10.1145\/3658137"},{"key":"23_CR147","doi-asserted-by":"crossref","unstructured":"Yao, S., Sun, M., Li, B., Yang, F., Wang, J., Zhang, R.: Dance with you: the diversity controllable dancer generation via diffusion models. In: Proceedings of the 31st ACM International Conference on Multimedia,D pp. 8504\u20138514 (2023)","DOI":"10.1145\/3581783.3612046"},{"key":"23_CR148","unstructured":"Yazdian, P.J., Liu, E., Cheng, L., Lim, A.: MotionScript: natural language descriptions for expressive 3D human motions. arXiv preprint arXiv:2312.12634 (2023)"},{"key":"23_CR149","doi-asserted-by":"crossref","unstructured":"Yin, L., et al.: EMoG: synthesizing emotive co-speech 3D gesture with diffusion model. arXiv preprint arXiv:2306.11496 (2023)","DOI":"10.2139\/ssrn.4818829"},{"issue":"6","key":"23_CR150","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417838","volume":"39","author":"Y Yoon","year":"2020","unstructured":"Yoon, Y., et al.: Speech gesture generation from the trimodal context of text, audio, and speaker identity. ACM Trans. Graph. (TOG) 39(6), 1\u201316 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR151","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Song, J., Iqbal, U., Vahdat, A., Kautz, J.: PhysDiff: Physics-guided human motion diffusion model. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16010\u201316021 (2023)","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"23_CR152","doi-asserted-by":"crossref","unstructured":"Zhai, Y., et al.: Language-guided human motion synthesis with atomic actions. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 5262\u20135271 (2023)","DOI":"10.1145\/3581783.3612289"},{"key":"23_CR153","doi-asserted-by":"crossref","unstructured":"Zhang, J., et al.: T2M-GPT: generating human motion from textual descriptions with discrete representations. arXiv preprint arXiv:2301.06052 (2023)","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"23_CR154","unstructured":"Zhang, J., et al.: TapMo: shape-aware motion generation of skeleton-free characters. arXiv preprint arXiv:2310.12678 (2023)"},{"key":"23_CR155","doi-asserted-by":"crossref","unstructured":"Zhang, M., et al.: MotionDiffuse: text-driven human motion generation with diffusion model. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3355414"},{"key":"23_CR156","doi-asserted-by":"crossref","unstructured":"Zhang, M., et al.: ReMoDiffuse: retrieval-augmented motion diffusion model. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, 1\u20136 October 2023, pp. 364\u2013373 (2023)","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"23_CR157","unstructured":"Zhang, M., Li, H., Cai, Z., Ren, J., Yang, L., Liu, Z.: FineMoGen: fine-grained spatio-temporal motion generation and editing. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR158","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"518","DOI":"10.1007\/978-3-031-20065-6_30","volume-title":"Computer Vision \u2013 ECCV 2022","author":"X Zhang","year":"2022","unstructured":"Zhang, X., Bhatnagar, B.L., Starke, S., Guzov, V., Pons-Moll, G.: COUCH: towards controllable human-chair interactions. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13665, pp. 518\u2013535. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20065-6_30"},{"key":"23_CR159","unstructured":"Zhang, Y., et al.: MotionGPT: finetuned LLMs are general-purpose motion generators. arXiv preprint arXiv:2306.10900 (2023)"},{"key":"23_CR160","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Tsipidi, E., Schriber, S., Kapadia, M., Gross, M., Modi, A.: Generating animations from screenplays. In: Proceedings of the Eighth Joint Conference on Lexical and Computational Semantics, *SEM@NAACL-HLT 2019, Minneapolis, MN, USA, 6\u20137 June 2019, pp. 292\u2013307 (2019)","DOI":"10.18653\/v1\/S19-1032"},{"key":"23_CR161","unstructured":"Zhao, M., Liu, M., Ren, B., Dai, S., Sebe, N.: MoDiff: action-conditioned 3D motion generation with denoising diffusion probabilistic models. arXiv preprint arXiv:2301.03949 (2023)"},{"key":"23_CR162","doi-asserted-by":"crossref","unstructured":"Zhao, W., Hu, L., Zhang, S.: DiffuGesture: generating human gesture from two-person dialogue with diffusion models. In: Companion Publication of the 25th International Conference on Multimodal Interaction, pp. 179\u2013185 (2023)","DOI":"10.1145\/3610661.3616552"},{"key":"23_CR163","doi-asserted-by":"crossref","unstructured":"Zhi, Y., et al.: LivelySpeaker: towards semantic-aware co-speech gesture generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20807\u201320817 (2023)","DOI":"10.1109\/ICCV51070.2023.01902"},{"key":"23_CR164","doi-asserted-by":"crossref","unstructured":"Zhong, C., Hu, L., Zhang, Z., Xia, S.: AttT2M: text-driven human motion generation with multi-perspective attention mechanism. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 509\u2013519 (2023)","DOI":"10.1109\/ICCV51070.2023.00053"},{"key":"23_CR165","doi-asserted-by":"crossref","unstructured":"Zhou, W., et al.: EMDM: efficient motion diffusion model for fast, high-quality motion generation. arXiv preprint arXiv:2312.02256 (2023)","DOI":"10.1007\/978-3-031-72627-9_2"},{"key":"23_CR166","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Wang, B.: UDE: A unified driving engine for human motion generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5632\u20135641 (2023)","DOI":"10.1109\/CVPR52729.2023.00545"},{"key":"23_CR167","doi-asserted-by":"crossref","unstructured":"Zhuang, W., Wang, C., Chai, J., Wang, Y., Shao, M., Xia, S.: Music2Dance: DanceNet for music-driven dance generation. ACM Trans. Multimed. Comput. Commun. Appl. (TOMM) 18(2), 1\u201321 (2022)","DOI":"10.1145\/3485664"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72624-8_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T07:49:17Z","timestamp":1732952957000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72624-8_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031726231","9783031726248"],"references-count":167,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72624-8_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}