{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T18:03:05Z","timestamp":1775584985906,"version":"3.50.1"},"publisher-location":"Cham","reference-count":71,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729911","type":"print"},{"value":"9783031729928","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72992-8_8","type":"book-chapter","created":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:29:02Z","timestamp":1730190542000},"page":"126-143","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["ParCo: Part-Coordinating Text-to-Motion 
Synthesis"],"prefix":"10.1007","author":[{"given":"Qiran","family":"Zou","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8832-6372","authenticated-orcid":false,"given":"Shangyuan","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Shian","family":"Du","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Chang","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Xiangyang","family":"Ji","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,30]]},"reference":[{"key":"8_CR1","doi-asserted-by":"crossref","unstructured":"Ahn, H., Ha, T., Choi, Y., Yoo, H., Oh, S.: Text2Action: generative adversarial synthesis from language to action. In: ICRA (2018)","DOI":"10.1109\/ICRA.2018.8460608"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Ahuja, C., Morency, L.P.: Language2Pose: natural language grounded pose forecasting. In: 3DV (2019)","DOI":"10.1109\/3DV.2019.00084"},{"key":"8_CR3","doi-asserted-by":"crossref","unstructured":"Antakli, A., Hermann, E., Zinnikus, I., Du, H., Fischer, K.: Intelligent distributed human motion simulation in human-robot collaboration environments. In: ACM IVA (2018)","DOI":"10.1145\/3267851.3267867"},{"key":"8_CR4","doi-asserted-by":"crossref","unstructured":"Athanasiou, N., Petrovich, M., Black, M.J., Varol, G.: SINC: spatial composition of 3d human motions for simultaneous action generation. arXiv preprint arXiv:2304.10417 (2023)","DOI":"10.1109\/ICCV51070.2023.00916"},{"key":"8_CR5","doi-asserted-by":"crossref","unstructured":"Barsoum, E., Kender, J., Liu, Z.: HP-GAN: probabilistic 3D human motion prediction via GAN. 
In: CVPRW (2018)","DOI":"10.1109\/CVPRW.2018.00191"},{"key":"8_CR6","doi-asserted-by":"crossref","unstructured":"Bhattacharya, U., Rewkowski, N., Banerjee, A., Guhan, P., Bera, A., Manocha, D.: Text2Gestures: a transformer-based network for generating emotive body gestures for virtual agents. In: 2021 IEEE Virtual Reality and 3D User Interfaces (VR) (2021)","DOI":"10.1109\/VR50410.2021.00037"},{"key":"8_CR7","doi-asserted-by":"crossref","unstructured":"Butepage, J., Black, M.J., Kragic, D., Kjellstrom, H.: Deep representation learning for human motion prediction and classification. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.173"},{"key":"8_CR8","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: Executing your commands via motion diffusion in latent space. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"8_CR9","unstructured":"Chen, X., et al.: Learning variational motion prior for video-based motion capture. arXiv preprint arXiv:2210.15134 (2022)"},{"key":"8_CR10","doi-asserted-by":"crossref","unstructured":"Djuric, N., et al.: Uncertainty-aware short-term motion prediction of traffic actors for autonomous driving. In: WACV (2020)","DOI":"10.1109\/WACV45572.2020.9093332"},{"key":"8_CR11","unstructured":"Duan, Y., et al.: Single-shot motion completion with transformer. arXiv preprint arXiv:2103.00776 (2021)"},{"key":"8_CR12","doi-asserted-by":"crossref","unstructured":"Fragkiadaki, K., Levine, S., Felsen, P., Malik, J.: Recurrent network models for human dynamics. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.494"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Cheema, N., Oguz, C., Theobalt, C., Slusallek, P.: Synthesis of compositional animations from textual descriptions. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00143"},{"key":"8_CR14","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3D human motions from text. 
In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"8_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"580","DOI":"10.1007\/978-3-031-19833-5_34","volume-title":"Computer Vision \u2013 ECCV 2022","author":"C Guo","year":"2022","unstructured":"Guo, C., Zuo, X., Wang, S., Cheng, L.: TM2T: stochastic and tokenized modeling for the reciprocal generation of 3D human motions and texts. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13695, pp. 580\u2013597. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_34"},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Action2Motion: conditioned generation of 3D human motions. In: ACM MM (2020)","DOI":"10.1145\/3394171.3413635"},{"key":"8_CR17","doi-asserted-by":"crossref","unstructured":"Harvey, F.G., Pal, C.: Recurrent transition networks for character locomotion. In: SIGGRAPH (2018)","DOI":"10.1145\/3283254.3283277"},{"key":"8_CR18","doi-asserted-by":"crossref","unstructured":"Harvey, F.G., Yurick, M., Nowrouzezahrai, D., Pal, C.: Robust motion in-betweening. ACM TOG (2020)","DOI":"10.1145\/3386569.3392480"},{"key":"8_CR19","doi-asserted-by":"crossref","unstructured":"Herbet, G., Duffau, H.: Revisiting the functional anatomy of the human brain: toward a meta-networking theory of cerebral functions. Physiol. Rev. (2020)","DOI":"10.1152\/physrev.00033.2019"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Hernandez, A., Gall, J., Moreno-Noguer, F.: Human motion prediction via spatio-temporal inpainting. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00723"},{"key":"8_CR21","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. 
In: NeurIPS (2017)"},{"key":"8_CR22","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: NeurIPS (2020)"},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Kappel, M., et al.: High-fidelity neural human motion transfer from monocular video. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00159"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Kaufmann, M., Aksan, E., Song, J., Pece, F., Ziegler, R., Hilliges, O.: Convolutional autoencoders for human motion infilling. In: 3DV (2020)","DOI":"10.1109\/3DV50981.2020.00102"},{"key":"8_CR25","doi-asserted-by":"crossref","unstructured":"Kim, J., Kim, J., Choi, S.: FLAME: free-form language-based motion synthesis & editing. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i7.25996"},{"key":"8_CR26","unstructured":"Koppula, H., Saxena, A.: Learning spatio-temporal structure from RGB-D videos for human activity detection and anticipation. In: ICML (2013)"},{"key":"8_CR27","doi-asserted-by":"crossref","unstructured":"Koppula, H.S., Saxena, A.: Anticipating human activities using object affordances for reactive robotic response. IEEE TPAMI (2015)","DOI":"10.1109\/TPAMI.2015.2430335"},{"key":"8_CR28","unstructured":"Kullback, S.: Information theory and statistics (1997)"},{"key":"8_CR29","unstructured":"Lab, C.G.: CMU graphics lab motion capture database (2016)"},{"key":"8_CR30","unstructured":"Lee, H.Y., et al.: Dancing to music. In: NeurIPS (2019)"},{"key":"8_CR31","doi-asserted-by":"crossref","unstructured":"Li, B., Zhao, Y., Zhelun, S., Sheng, L.: DanceFormer: music conditioned 3D dance generation with parametric motion transformer. In: AAAI (2022)","DOI":"10.1609\/aaai.v36i2.20014"},{"key":"8_CR32","doi-asserted-by":"crossref","unstructured":"Li, R., Yang, S., Ross, D.A., Kanazawa, A.: AI choreographer: music conditioned 3D dance generation with AIST++. 
In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"8_CR33","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"8_CR34","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. In: Seminal Graphics Papers: Pushing the Boundaries, vol. 2 (2023)","DOI":"10.1145\/3596711.3596800"},{"key":"8_CR35","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: AMASS: archive of motion capture as surface shapes. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"8_CR37","doi-asserted-by":"crossref","unstructured":"Majoe, D., Widmer, L., Gutknecht, J.: Enhanced motion interaction for multimedia applications. In: Proceedings of the 7th International Conference on Advances in Mobile Computing and Multimedia (2009)","DOI":"10.1145\/1821748.1821760"},{"key":"8_CR38","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"474","DOI":"10.1007\/978-3-030-58568-6_28","volume-title":"Computer Vision \u2013 ECCV 2020","author":"W Mao","year":"2020","unstructured":"Mao, W., Liu, M., Salzmann, M.: History repeats itself: human motion prediction via motion attention. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12359, pp. 474\u2013489. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_28"},{"key":"8_CR39","doi-asserted-by":"crossref","unstructured":"Mao, W., Liu, M., Salzmann, M., Li, H.: Learning trajectory dependencies for human motion prediction. 
In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00958"},{"key":"8_CR40","doi-asserted-by":"crossref","unstructured":"Martinez, J., Black, M.J., Romero, J.: On human motion prediction using recurrent neural networks. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.497"},{"key":"8_CR41","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"8_CR42","unstructured":"Parmar, N., et al.: Image transformer. In: ICML (2018)"},{"key":"8_CR43","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3D hands, face, and body from a single image. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01123"},{"key":"8_CR44","unstructured":"Pavllo, D., Grangier, D., Auli, M.: QuaterNet: a quaternion-based recurrent model for human motion. arXiv preprint arXiv:1805.06485 (2018)"},{"key":"8_CR45","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Action-conditioned 3D human motion synthesis with transformer VAE. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"8_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"480","DOI":"10.1007\/978-3-031-20047-2_28","volume-title":"Computer Vision \u2013 ECCV 2022","author":"M Petrovich","year":"2022","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from textual descriptions. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13682, pp. 480\u2013497. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_28"},{"key":"8_CR47","doi-asserted-by":"crossref","unstructured":"Plappert, M., Mandery, C., Asfour, T.: The KIT motion-language dataset. 
Big Data (2016)","DOI":"10.1089\/big.2016.0028"},{"key":"8_CR48","doi-asserted-by":"crossref","unstructured":"Plappert, M., Mandery, C., Asfour, T.: Learning a bidirectional mapping between human whole-body motion and natural language using deep recurrent neural networks. Robot. Auton. Syst. (2018)","DOI":"10.1016\/j.robot.2018.07.006"},{"key":"8_CR49","doi-asserted-by":"crossref","unstructured":"Raab, S., Leibovitch, I., Li, P., Aberman, K., Sorkine-Hornung, O., Cohen-Or, D.: MoDi: unconditional motion synthesis from diverse data. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01333"},{"key":"8_CR50","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"8_CR51","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"8_CR52","doi-asserted-by":"crossref","unstructured":"Rempe, D., Birdal, T., Hertzmann, A., Yang, J., Sridhar, S., Guibas, L.J.: HuMoR: 3D human motion model for robust pose estimation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01129"},{"key":"8_CR53","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"8_CR54","doi-asserted-by":"crossref","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. In: NeurIPS (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"8_CR55","doi-asserted-by":"crossref","unstructured":"Thiebaut\u00a0de Schotten, M., Forkel, S.J.: The emergent properties of the connected brain. Science (2022)","DOI":"10.1126\/science.abq2591"},{"key":"8_CR56","unstructured":"Shafir, Y., Tevet, G., Kapon, R., Bermano, A.H.: Human motion diffusion as a generative prior. 
arXiv preprint arXiv:2303.01418 (2023)"},{"key":"8_CR57","doi-asserted-by":"crossref","unstructured":"Siyao, L., et al.: Bailando: 3D dance generation by actor-critic GPT with choreographic memory. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01077"},{"key":"8_CR58","doi-asserted-by":"crossref","unstructured":"Tang, X., et al.: Real-time controllable motion transition for characters. ACM TOG (2022)","DOI":"10.1145\/3528223.3530090"},{"key":"8_CR59","doi-asserted-by":"crossref","unstructured":"Terlemez, \u00d6., Ulbrich, S., Mandery, C., Do, M., Vahrenkamp, N., Asfour, T.: Master motor map (MMM)-framework and toolkit for capturing, representing, and reproducing human motion on humanoid robots. In: 2014 IEEE-RAS International Conference on Humanoid Robots (2014)","DOI":"10.1109\/HUMANOIDS.2014.7041470"},{"key":"8_CR60","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"358","DOI":"10.1007\/978-3-031-20047-2_21","volume-title":"Computer Vision \u2013 ECCV 2022","author":"G Tevet","year":"2022","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: MotionCLIP: exposing human motion generation to clip space. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13682, pp. 358\u2013373. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_21"},{"key":"8_CR61","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-Or, D., Bermano, A.H.: Human motion diffusion model. arXiv preprint arXiv:2209.14916 (2022)"},{"key":"8_CR62","unstructured":"Vaswani, A., et al.: Attention is all you need. NeurIPS (2017)"},{"key":"8_CR63","doi-asserted-by":"crossref","unstructured":"Wang, Y., Leng, Z., Li, F.W., Wu, S.C., Liang, X.: FG-T2M: fine-grained text-driven human motion generation via diffusion model. 
In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.02014"},{"key":"8_CR64","doi-asserted-by":"crossref","unstructured":"Yan, S., Li, Z., Xiong, Y., Yan, H., Lin, D.: Convolutional sequence generation for skeleton-based action synthesis. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00449"},{"key":"8_CR65","doi-asserted-by":"crossref","unstructured":"Yeasin, M., Polat, E., Sharma, R.: A multiobject tracking framework for interactive multimedia applications. IEEE TMM (2004)","DOI":"10.1109\/TMM.2004.827514"},{"key":"8_CR66","doi-asserted-by":"crossref","unstructured":"Zhang, J., et al.: T2M-GPT: generating human motion from textual descriptions with discrete representations. arXiv preprint arXiv:2301.06052 (2023)","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"8_CR67","unstructured":"Zhang, M., et al.: MotionDiffuse: text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)"},{"key":"8_CR68","doi-asserted-by":"crossref","unstructured":"Zhang, M., et al.: ReMoDiffuse: retrieval-augmented motion diffusion model. arXiv preprint arXiv:2304.01116 (2023)","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"8_CR69","unstructured":"Zhang, Y., Black, M.J., Tang, S.: Perpetual motion: generating unbounded human motion. arXiv preprint arXiv:2007.13886 (2020)"},{"key":"8_CR70","doi-asserted-by":"crossref","unstructured":"Zhao, R., Su, H., Ji, Q.: Bayesian adversarial human motion synthesis. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00626"},{"key":"8_CR71","doi-asserted-by":"crossref","unstructured":"Zhong, C., Hu, L., Zhang, Z., Xia, S.: AttT2M: text-driven human motion generation with multi-perspective attention mechanism. 
In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00053"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72992-8_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:43:19Z","timestamp":1730191399000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72992-8_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,30]]},"ISBN":["9783031729911","9783031729928"],"references-count":71,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72992-8_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,30]]},"assertion":[{"value":"30 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}