{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:39:34Z","timestamp":1770917974714,"version":"3.50.1"},"publisher-location":"Cham","reference-count":89,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031736490","type":"print"},{"value":"9783031736506","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73650-6_5","type":"book-chapter","created":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T18:16:15Z","timestamp":1732126575000},"page":"67-86","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Towards Open Domain Text-Driven Synthesis of\u00a0Multi-person Motions"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1520-5979","authenticated-orcid":false,"given":"Mengyi","family":"Shan","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4036-7690","authenticated-orcid":false,"given":"Lu","family":"Dong","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3867-6925","authenticated-orcid":false,"given":"Yutao","family":"Han","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5789-3554","authenticated-orcid":false,"given":"Yuan","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1414-6433","authenticated-orcid":false,"given":"Ifeoma","family":"Nwogu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3508-1851","authenticated-orcid":false,"given":"Guo-Jun","family":"Qi","sequence":"additional","affiliation":[]},{"given":"Mitch","family":"Hill","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,21]]},"reference":[{"key":"5_CR1","doi-asserted-by":"crossref","unstructured":"Van\u00a0der Aa, N., Luo, X., Giezeman, G.J., Tan, R.T., Veltkamp, R.C.: UMPM benchmark: a multi-person dataset with synchronized video and motion capture data for evaluation of articulated human motion and interaction. In: 2011 IEEE International Conference on Computer Vision Workshops (ICCV Workshops), pp. 1264\u20131269. IEEE (2011)","DOI":"10.1109\/ICCVW.2011.6130396"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: IEEE International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Barsoum, E., Kender, J., Liu, Z.: HP-GAN: probabilistic 3D human motion prediction via GAN. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 1418\u20131427 (2018)","DOI":"10.1109\/CVPRW.2018.00191"},{"key":"5_CR4","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: Executing your commands via motion diffusion in latent space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18000\u201318010 (2023)","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Choi, H., Moon, G., Chang, J.Y., Lee, K.M.: Beyond static features for temporally consistent 3D human pose and shape from a video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1964\u20131973 (2021)","DOI":"10.1109\/CVPR46437.2021.00200"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Choi, H., Moon, G., Park, J., Lee, K.M.: Learning to estimate robust 3D human mesh from in-the-wild crowded scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1475\u20131484 (2022)","DOI":"10.1109\/CVPR52688.2022.00153"},{"key":"5_CR7","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"5_CR8","unstructured":"Doersch, C., Zisserman, A.: Sim2real transfer learning for 3D human pose estimation: motion to the rescue. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"5_CR9","unstructured":"Duan, Y., et al.: Single-shot motion completion with transformer (2021)"},{"key":"5_CR10","unstructured":"Fieraru, M., Zanfir, M., Szente, T., Bazavan, E., Olaru, V., Sminchisescu, C.: REMIPS: physically consistent 3D reconstruction of multiple interacting people under weak supervision. In: Advances in Neural Information Processing Systems, vol. 34, pp. 19385\u201319397 (2021)"},{"key":"5_CR11","doi-asserted-by":"publisher","unstructured":"Fragkiadaki, K., Levine, S., Felsen, P., Malik, J.: Recurrent network models for human dynamics. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4346\u20134354 (2015). https:\/\/doi.org\/10.1109\/ICCV.2015.494","DOI":"10.1109\/ICCV.2015.494"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Goel, S., Pavlakos, G., Rajasegaran, J., Kanazawa, A., Malik, J.: Humans in 4D: reconstructing and tracking humans with transformers. arXiv preprint arXiv:2305.20091 (2023)","DOI":"10.1109\/ICCV51070.2023.01358"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3D human motions from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5152\u20135161 (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"5_CR14","series-title":"LNCS","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_34","volume-title":"ECCV 2022","author":"C Guo","year":"2022","unstructured":"Guo, C., Zuo, X., Wang, S., Cheng, L.: TM2T: stochastic and tokenized modeling for the reciprocal generation of 3D human motions and texts. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13695. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_34"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Guo, W., Bie, X., Alameda-Pineda, X., Moreno-Noguer, F.: Multi-person extreme motion prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13053\u201313064 (2022)","DOI":"10.1109\/CVPR52688.2022.01271"},{"key":"5_CR16","unstructured":"Guo, Y., et al.: Animatediff: animate your personalized text-to-image diffusion models without specific tuning (2023)"},{"key":"5_CR17","doi-asserted-by":"publisher","unstructured":"Harvey, F.G., Pal, C.: Recurrent transition networks for character locomotion. In: ACM SIGGRAPH Asia 2018 Technical Briefs. Association for Computing Machinery, New York (2018). https:\/\/doi.org\/10.1145\/3283254.3283277","DOI":"10.1145\/3283254.3283277"},{"key":"5_CR18","doi-asserted-by":"publisher","unstructured":"Harvey, F.G., Yurick, M., Nowrouzezahrai, D., Pal, C.: Robust motion in-betweening. ACM Trans. Graph. (TOG)) 39(4) (2020). https:\/\/doi.org\/10.1145\/3386569.3392480","DOI":"10.1145\/3386569.3392480"},{"key":"5_CR19","doi-asserted-by":"publisher","unstructured":"Hernandez, A., Gall, J., Moreno, F.: Human motion prediction via spatio-temporal inpainting. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7133\u20137142 (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00723","DOI":"10.1109\/ICCV.2019.00723"},{"key":"5_CR20","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. arXiv preprint arxiv:2006.11239 (2020)"},{"key":"5_CR21","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. arXiv:2204.03458 (2022)"},{"key":"5_CR22","unstructured":"Jiang, A.Q., et al.: Mistral 7b (2023)"},{"key":"5_CR23","unstructured":"Jiang, B., Chen, X., Liu, W., Yu, J., Yu, G., Chen, T.: MotionGPT: human motion as a foreign language. arXiv preprint arXiv:2306.14795 (2023)"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Joo, H., Simon, T., Cikara, M., Sheikh, Y.: Towards social artificial intelligence: nonverbal social signal prediction in a triadic interaction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10873\u201310883 (2019)","DOI":"10.1109\/CVPR.2019.01113"},{"key":"5_CR25","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Black, M.J., Jacobs, D.W., Malik, J.: End-to-end recovery of human shape and pose. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7122\u20137131 (2018)","DOI":"10.1109\/CVPR.2018.00744"},{"key":"5_CR26","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Zhang, J.Y., Felsen, P., Malik, J.: Learning 3D human dynamics from video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5614\u20135623 (2019)","DOI":"10.1109\/CVPR.2019.00576"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Khirodkar, R., Tripathi, S., Kitani, K.: Occluded human mesh recovery. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1715\u20131725 (2022)","DOI":"10.1109\/CVPR52688.2022.00176"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Kocabas, M., Athanasiou, N., Black, M.J.: Vibe: video inference for human body pose and shape estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5253\u20135263 (2020)","DOI":"10.1109\/CVPR42600.2020.00530"},{"key":"5_CR29","doi-asserted-by":"crossref","unstructured":"Kocabas, M., Huang, C.H.P., Hilliges, O., Black, M.J.: Pare: part attention regressor for 3D human body estimation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11127\u201311137 (2021)","DOI":"10.1109\/ICCV48922.2021.01094"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Kolotouros, N., Pavlakos, G., Black, M.J., Daniilidis, K.: Learning to reconstruct 3D human pose and shape via model-fitting in the loop. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2252\u20132261 (2019)","DOI":"10.1109\/ICCV.2019.00234"},{"key":"5_CR31","doi-asserted-by":"crossref","unstructured":"Kolotouros, N., Pavlakos, G., Daniilidis, K.: Convolutional mesh regression for single-image human shape reconstruction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4501\u20134510 (2019)","DOI":"10.1109\/CVPR.2019.00463"},{"key":"5_CR32","doi-asserted-by":"crossref","unstructured":"Kolotouros, N., Pavlakos, G., Jayaraman, D., Daniilidis, K.: Probabilistic modeling for human mesh recovery. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11605\u201311614 (2021)","DOI":"10.1109\/ICCV48922.2021.01140"},{"key":"5_CR33","doi-asserted-by":"crossref","unstructured":"Le, N., Pham, T., Do, T., Tjiputra, E., Tran, Q.D., Nguyen, A.: Music-driven group choreography. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8673\u20138682 (2023)","DOI":"10.1109\/CVPR52729.2023.00838"},{"key":"5_CR34","doi-asserted-by":"publisher","unstructured":"Li, P., Aberman, K., Zhang, Z., Hanocka, R., Sorkine-Hornung, O.: Ganimator: neural motion synthesis from a single sequence. ACM Trans. Graph. (TOG) 41(4), 1\u201312 (2022). https:\/\/doi.org\/10.1145\/3528223.3530157","DOI":"10.1145\/3528223.3530157"},{"key":"5_CR35","doi-asserted-by":"crossref","unstructured":"Li, Y., Takehara, H., Taketomi, T., Zheng, B., Nie\u00dfner, M.: 4dcomplete: non-rigid motion estimation beyond the observable surface. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12706\u201312716 (2021)","DOI":"10.1109\/ICCV48922.2021.01247"},{"key":"5_CR36","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"590","DOI":"10.1007\/978-3-031-20065-6_34","volume-title":"ECCV 2022","author":"Z Li","year":"2022","unstructured":"Li, Z., Liu, J., Zhang, Z., Xu, S., Yan, Y.: CLIFF: carrying location information in full frames into human pose and shape estimation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13665, pp. 590\u2013606. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20065-6_34"},{"key":"5_CR37","doi-asserted-by":"crossref","unstructured":"Li, Z., Xu, B., Huang, H., Lu, C., Guo, Y.: Deep two-stream video inference for human body pose and shape estimation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 430\u2013439 (2022)","DOI":"10.1109\/WACV51458.2022.00071"},{"key":"5_CR38","doi-asserted-by":"crossref","unstructured":"Liang, H., Zhang, W., Li, W., Yu, J., Xu, L.: Intergen: diffusion-based multi-human motion generation under complex interactions. arXiv preprint arXiv:2304.05684 (2023)","DOI":"10.1007\/s11263-024-02042-6"},{"key":"5_CR39","doi-asserted-by":"crossref","unstructured":"Lin, K., Wang, L., Liu, Z.: Mesh graphormer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12939\u201312948 (2021)","DOI":"10.1109\/ICCV48922.2021.01270"},{"issue":"10","key":"5_CR40","doi-asserted-by":"publisher","first-page":"2684","DOI":"10.1109\/TPAMI.2019.2916873","volume":"42","author":"J Liu","year":"2019","unstructured":"Liu, J., Shahroudy, A., Perez, M., Wang, G., Duan, L.Y., Kot, A.C.: NTU RGB+ d 120: a large-scale benchmark for 3D human activity understanding. IEEE Trans. Pattern Anal. Mach. Intell. 42(10), 2684\u20132701 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"5_CR41","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: A Skinned Multi-Person Linear Model, 1st edn. Association for Computing Machinery, New York (2023). https:\/\/doi.org\/10.1145\/3596711.3596800","DOI":"10.1145\/3596711.3596800"},{"key":"5_CR42","unstructured":"Ma, J., Bai, S., Zhou, C.: Pretrained diffusion models for unified human motion synthesis. arXiv preprint arXiv:2212.02837 (2022)"},{"key":"5_CR43","doi-asserted-by":"crossref","unstructured":"Maheshwari, S., Gupta, D., Sarvadevabhatla, R.K.: MUGL: large scale multi person conditional action generation with locomotion. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 257\u2013265 (2022)","DOI":"10.1109\/WACV51458.2022.00082"},{"key":"5_CR44","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: AMASS: archive of motion capture as surface shapes. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5442\u20135451 (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"5_CR45","doi-asserted-by":"crossref","unstructured":"Martinez, J., Black, M.J., Romero, J.: On human motion prediction using recurrent neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2891\u20132900 (2017)","DOI":"10.1109\/CVPR.2017.497"},{"key":"5_CR46","doi-asserted-by":"crossref","unstructured":"Mehta, D., et al.: Single-shot multi-person 3D pose estimation from monocular RGB. In: 2018 International Conference on 3D Vision (3DV), pp. 120\u2013130. IEEE (2018)","DOI":"10.1109\/3DV.2018.00024"},{"issue":"3","key":"5_CR47","doi-asserted-by":"publisher","first-page":"231","DOI":"10.1006\/cviu.2000.0897","volume":"81","author":"TB Moeslund","year":"2001","unstructured":"Moeslund, T.B., Granum, E.: A survey of computer vision-based human motion capture. Comput. Vis. Image Underst. 81(3), 231\u2013268 (2001). https:\/\/doi.org\/10.1006\/cviu.2000.0897","journal-title":"Comput. Vis. Image Underst."},{"key":"5_CR48","doi-asserted-by":"crossref","unstructured":"Ng, E., et al.: Learning to listen: modeling non-deterministic dyadic facial motion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20395\u201320405 (2022)","DOI":"10.1109\/CVPR52688.2022.01975"},{"key":"5_CR49","doi-asserted-by":"crossref","unstructured":"Ng, E., et al.: From audio to photoreal embodiment: synthesizing humans in conversations. arXiv preprint arXiv:2401.01885 (2024)","DOI":"10.1109\/CVPR52733.2024.00101"},{"key":"5_CR50","doi-asserted-by":"crossref","unstructured":"Ng, E., Xiang, D., Joo, H., Grauman, K.: You2me: inferring body pose in egocentric video via first and second person interactions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9890\u20139900 (2020)","DOI":"10.1109\/CVPR42600.2020.00991"},{"key":"5_CR51","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Action-conditioned 3D human motion synthesis with transformer VAE. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10985\u201310995"},{"key":"5_CR52","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from textual descriptions. In: Proceedings of the European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-20047-2_28"},{"key":"5_CR53","doi-asserted-by":"publisher","unstructured":"Plappert, M., Mandery, C., Asfour, T.: The kit motion-language dataset. Big Data 4(4), 236\u2013252 (2016). https:\/\/doi.org\/10.1089\/big.2016.0028","DOI":"10.1089\/big.2016.0028"},{"key":"5_CR54","doi-asserted-by":"crossref","unstructured":"Punnakkal, A.R., Chandrasekaran, A., Athanasiou, N., Quiros-Ramirez, A., Black, M.J.: Babel: bodies, action and behavior with English labels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 722\u2013731 (2021)","DOI":"10.1109\/CVPR46437.2021.00078"},{"key":"5_CR55","doi-asserted-by":"crossref","unstructured":"Qiu, Z., et al.: PSVT: end-to-end multi-person 3D pose and shape estimation with progressive video transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21254\u201321263 (2023)","DOI":"10.1109\/CVPR52729.2023.02036"},{"key":"5_CR56","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision (2021)"},{"key":"5_CR57","unstructured":"Rajasegaran, J., Pavlakos, G., Kanazawa, A., Malik, J.: Tracking people with 3D representations. arXiv preprint arXiv:2111.07868 (2021)"},{"key":"5_CR58","unstructured":"Schuhmann, C., et al.: LAION-400M: open dataset of clip-filtered 400 million image-text pairs abs\/2111.02114 (2021). https:\/\/arxiv.org\/abs\/2111.02114"},{"key":"5_CR59","unstructured":"Shafir, Y., Tevet, G., Kapon, R., Bermano, A.H.: Human motion diffusion as a generative prior. arXiv preprint arXiv:2303.01418 (2023)"},{"key":"5_CR60","doi-asserted-by":"crossref","unstructured":"Sun, Y., Bao, Q., Liu, W., Fu, Y., Michael,\u00a0J.B., Mei, T.: Monocular, one-stage, regression of multiple 3D people. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01099"},{"key":"5_CR61","doi-asserted-by":"crossref","unstructured":"Sun, Y., Bao, Q., Liu, W., Mei, T., Black, M.J.: TRACE: 5D temporal regression of avatars with dynamic cameras in 3D environments. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00855"},{"key":"5_CR62","doi-asserted-by":"crossref","unstructured":"Sun, Y., Liu, W., Bao, Q., Fu, Y., Mei, T., Black, M.J.: Putting people in their place: monocular regression of 3D people in depth. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01289"},{"key":"5_CR63","doi-asserted-by":"crossref","unstructured":"Sun, Y., Ye, Y., Liu, W., Gao, W., Fu, Y., Mei, T.: Human mesh recovery from monocular images via a skeleton-disentangled representation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5349\u20135358 (2019)","DOI":"10.1109\/ICCV.2019.00545"},{"key":"5_CR64","doi-asserted-by":"crossref","unstructured":"Tanaka, M., Fujiwara, K.: Role-aware interaction generation from textual description. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15999\u201316009 (2023)","DOI":"10.1109\/ICCV51070.2023.01466"},{"key":"5_CR65","doi-asserted-by":"crossref","unstructured":"Tanke, J., et al.: Social diffusion: long-term multiple human motion anticipation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9601\u20139611 (2023)","DOI":"10.1109\/ICCV51070.2023.00880"},{"key":"5_CR66","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"358","DOI":"10.1007\/978-3-031-20047-2_21","volume-title":"ECCV 2022","author":"G Tevet","year":"2022","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: MotionCLIP: exposing human motion generation to CLIP space. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13682, pp. 358\u2013374. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_21"},{"key":"5_CR67","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-or, D., Bermano, A.H.: Human motion diffusion model. In: Proceedings of the 11th International Conference on Learning Representations (2023)"},{"key":"5_CR68","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems, vol.\u00a030 (2017)"},{"key":"5_CR69","doi-asserted-by":"crossref","unstructured":"Von\u00a0Marcard, T., Henschel, R., Black, M.J., Rosenhahn, B., Pons-Moll, G.: Recovering accurate 3D human pose in the wild using IMUs and a moving camera. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 601\u2013617 (2018)","DOI":"10.1007\/978-3-030-01249-6_37"},{"key":"5_CR70","unstructured":"Wang, J., Xu, H., Narasimhan, M., Wang, X.: Multi-person 3D motion prediction with multi-range transformers. In: Advances in Neural Information Processing Systems, vol. 34 (2021)"},{"key":"5_CR71","unstructured":"Wang, Z., Wang, J., Lin, D., Dai, B.: Intercontrol: generate human motion interactions by controlling every joint. arXiv preprint arXiv:2311.15864 (2023)"},{"key":"5_CR72","unstructured":"Wei, M., Miaomiao, L., Mathieu, S.: History repeats itself: Human motion prediction via motion attention. In: Proceedings of the European Conference on Computer Vision (2020)"},{"key":"5_CR73","doi-asserted-by":"crossref","unstructured":"Wei, W.L., Lin, J.C., Liu, T.L., Liao, H.Y.M.: Capturing humans in motion: temporal-attentive 3d human pose and shape estimation from monocular video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13211\u201313220 (2022)","DOI":"10.1109\/CVPR52688.2022.01286"},{"key":"5_CR74","unstructured":"Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2 (2019)"},{"key":"5_CR75","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: OmnimotionGPT: animal motion generation with limited data. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00125"},{"key":"5_CR76","unstructured":"Yao, P., Fang, Z., Wu, F., Feng, Y., Li, J.: Densebody: directly regressing dense 3D human pose and shape from a single color image. arXiv preprint arXiv:1903.10153 (2019)"},{"key":"5_CR77","doi-asserted-by":"crossref","unstructured":"Yu, Z., et al.: Skeleton2mesh: kinematics prior injected unsupervised human mesh recovery. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8619\u20138629 (2021)","DOI":"10.1109\/ICCV48922.2021.00850"},{"key":"5_CR78","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Iqbal, U., Molchanov, P., Kitani, K., Kautz, J.: GLAMR: global occlusion-aware human mesh recovery with dynamic cameras. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11038\u201311049 (2022)","DOI":"10.1109\/CVPR52688.2022.01076"},{"key":"5_CR79","doi-asserted-by":"crossref","unstructured":"Yuan, Y., et al.: Gavatar: Animatable 3D Gaussian avatars with implicit mesh learning. arXiv preprint arXiv:2312.11461 (2023)","DOI":"10.1109\/CVPR52733.2024.00091"},{"key":"5_CR80","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Song, J., Iqbal, U., Vahdat, A., Kautz, J.: PhysDiff: physics-guided human motion diffusion model. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"5_CR81","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Wei, S.E., Simon, T., Kitani, K., Saragih, J.: Simpoe: simulated character control for 3D human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7159\u20137169 (2021)","DOI":"10.1109\/CVPR46437.2021.00708"},{"key":"5_CR82","doi-asserted-by":"crossref","unstructured":"Zanfir, A., Bazavan, E.G., Zanfir, M., Freeman, W.T., Sukthankar, R., Sminchisescu, C.: Neural descent for visual 3D human pose and shape. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14484\u201314493 (2021)","DOI":"10.1109\/CVPR46437.2021.01425"},{"key":"5_CR83","unstructured":"Zanfir, A., Marinoiu, E., Zanfir, M., Popa, A.I., Sminchisescu, C.: Deep network for the integrated 3D sensing of multiple people in natural images. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"5_CR84","doi-asserted-by":"crossref","unstructured":"Zhai, Y., et al.: Language-guided human motion synthesis with atomic actions. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 5262\u20135271 (2023)","DOI":"10.1145\/3581783.3612289"},{"key":"5_CR85","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: PyMAF: 3D human pose and shape regression with pyramidal mesh alignment feedback loop. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11446\u201311456 (2021)","DOI":"10.1109\/ICCV48922.2021.01125"},{"key":"5_CR86","doi-asserted-by":"crossref","unstructured":"Zhang, J., et al.: T2M-GPT: generating human motion from textual descriptions with discrete representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"5_CR87","doi-asserted-by":"crossref","unstructured":"Zhang, M., et al.: Remodiffuse: retrieval-augmented motion diffusion model. arXiv preprint arXiv:2304.01116 (2023)","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"5_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, Y., An, L., Yu, T., Li, X., Li, K., Liu, Y.: 4D association graph for realtime multi-person motion capture using multiple video cameras. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1324\u20131333 (2020)","DOI":"10.1109\/CVPR42600.2020.00140"},{"key":"5_CR89","doi-asserted-by":"crossref","unstructured":"Zhao, Z., Bai, J., Chen, D., Wang, D., Pan, Y.: Taming diffusion models for music-driven conducting motion generation. arXiv preprint arXiv:2306.10065 (2023)","DOI":"10.1609\/aaaiss.v1i1.27474"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73650-6_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T19:03:23Z","timestamp":1732129403000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73650-6_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,21]]},"ISBN":["9783031736490","9783031736506"],"references-count":89,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73650-6_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,21]]},"assertion":[{"value":"21 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}