{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T10:11:26Z","timestamp":1767262286161,"version":"3.40.3"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031733369"},{"type":"electronic","value":"9783031733376"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73337-6_23","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T23:02:27Z","timestamp":1730329347000},"page":"403-421","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["FreeMotion: MoCap-Free Human Motion Synthesis with\u00a0Multimodal Large Language Models"],"prefix":"10.1007","author":[{"given":"Zhikai","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Yitang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Haofeng","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Mingxian","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Li","family":"Yi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"23_CR1","unstructured":"GPT-4V(ision) system card (2023). https:\/\/api.semanticscholar.org\/CorpusID:263218031"},{"issue":"4","key":"23_CR2","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/3386569.3392469","volume":"39","author":"K Aberman","year":"2020","unstructured":"Aberman, K., Weng, Y., Lischinski, D., Cohen-Or, D., Chen, B.: Unpaired motion style transfer from video to animation. ACM Trans. Graph. (TOG) 39(4), 64 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR3","unstructured":"Bommasani, R., et\u00a0al.: On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)"},{"key":"23_CR4","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol. 33, pp. 1877\u20131901 (2020)"},{"key":"23_CR5","unstructured":"Chang, A.X., et\u00a0al.: ShapeNet: an information-rich 3D model repository. arXiv preprint arXiv:1512.03012 (2015)"},{"key":"23_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: Executing your commands via motion diffusion in latent space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18000\u201318010 (2023)","DOI":"10.1109\/CVPR52729.2023.01726"},{"issue":"240","key":"23_CR7","first-page":"1","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery, A., et al.: PaLM: scaling language modeling with pathways. J. Mach. Learn. Res. 24(240), 1\u2013113 (2023)","journal-title":"J. Mach. Learn. Res."},{"key":"23_CR8","unstructured":"Dong, R., et\u00a0al.: DreamLLM: synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499 (2023)"},{"issue":"6","key":"23_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3478513.3480527","volume":"40","author":"L Fussell","year":"2021","unstructured":"Fussell, L., Bergamin, K., Holden, D.: SuperTrack: motion tracking for physically simulated characters using supervised learning. ACM Trans. Graph. (TOG) 40(6), 1\u201313 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Dabral, R., Golyanik, V., Theobalt, C., Slusallek, P.: IMoS: intent-driven full-body motion synthesis for human-object interactions. In: Computer Graphics Forum, vol.\u00a042, pp. 1\u201312. Wiley Online Library (2023)","DOI":"10.1111\/cgf.14739"},{"key":"23_CR11","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3D human motions from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5152\u20135161 (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Action2Motion: conditioned generation of 3D human motions. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 2021\u20132029 (2020)","DOI":"10.1145\/3394171.3413635"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"Hassan, M., et al.: Stochastic scene-aware motion prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11374\u201311384 (2021)","DOI":"10.1109\/ICCV48922.2021.01118"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Hassan, M., Guo, Y., Wang, T., Black, M., Fidler, S., Peng, X.B.: Synthesizing physical character-scene interactions. arXiv preprint arXiv:2302.00883 (2023)","DOI":"10.1145\/3588432.3591525"},{"issue":"4","key":"23_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2897824.2925975","volume":"35","author":"D Holden","year":"2016","unstructured":"Holden, D., Saito, J., Komura, T.: A deep learning framework for character motion synthesis and editing. ACM Trans. Graph. (TOG) 35(4), 1\u201311 (2016)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Hong, F., Zhang, M., Pan, L., Cai, Z., Yang, L., Liu, Z.: AvatarCLIP: zero-shot text-driven generation and animation of 3D avatars. arXiv preprint arXiv:2205.08535 (2022)","DOI":"10.1145\/3528223.3530094"},{"key":"23_CR17","unstructured":"Hu, Y., Lin, F., Zhang, T., Yi, L., Gao, Y.: Look before you leap: unveiling the power of GPT-4V in robotic vision-language planning. arXiv preprint arXiv:2311.17842 (2023)"},{"key":"23_CR18","unstructured":"Huang, W., Wang, C., Zhang, R., Li, Y., Wu, J., Fei-Fei, L.: VoxPoser: composable 3D value maps for robotic manipulation with language models. arXiv preprint arXiv:2307.05973 (2023)"},{"key":"23_CR19","unstructured":"Jiang, B., Chen, X., Liu, W., Yu, J., Yu, G., Chen, T.: MotionGPT: human motion as a foreign language. arXiv preprint arXiv:2306.14795 (2023)"},{"issue":"6","key":"23_CR20","first-page":"1","volume":"42","author":"J Li","year":"2023","unstructured":"Li, J., Wu, J., Liu, C.K.: Object motion guided human motion synthesis. ACM Trans. Graph. (TOG) 42(6), 1\u201311 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"4","key":"23_CR21","first-page":"1","volume":"41","author":"P Li","year":"2022","unstructured":"Li, P., Aberman, K., Zhang, Z., Hanocka, R., Sorkine-Hornung, O.: GANimator: neural motion synthesis from a single sequence. ACM Trans. Graph. (TOG) 41(4), 1\u201312 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lyu, K., Wu, S., Chen, H., Hao, Y., Ji, S.: Aggregated multi-GANs for controlled 3D human motion prediction. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 2225\u20132232 (2021)","DOI":"10.1609\/aaai.v35i3.16321"},{"key":"23_CR23","unstructured":"Ma, Y.J., et al.: LIV: language-image representations and rewards for robotic control (2023)"},{"key":"23_CR24","unstructured":"Ma, Y.J., et al.: Eureka: human-level reward design via coding large language models. arXiv preprint arXiv:2310.12931 (2023)"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: AMASS: archive of motion capture as surface shapes. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5442\u20135451 (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"23_CR26","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems, vol. 35, pp. 27730\u201327744 (2022)"},{"issue":"4","key":"23_CR27","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530110","volume":"41","author":"XB Peng","year":"2022","unstructured":"Peng, X.B., Guo, Y., Halper, L., Levine, S., Fidler, S.: ASE: large-scale reusable adversarial skill embeddings for physically simulated characters. ACM Trans. Graph. (TOG) 41(4), 1\u201317 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"4","key":"23_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450626.3459670","volume":"40","author":"XB Peng","year":"2021","unstructured":"Peng, X.B., Ma, Z., Abbeel, P., Levine, S., Kanazawa, A.: AMP: adversarial motion priors for stylized physics-based character control. ACM Trans. Graph. (ToG) 40(4), 1\u201320 (2021)","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"23_CR29","doi-asserted-by":"publisher","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from textual descriptions. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision, ECCV 2022. LNCS, vol. 13682, pp. 480\u2013497. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_28","DOI":"10.1007\/978-3-031-20047-2_28"},{"key":"23_CR30","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Rempe, D., et al.: Trace and pace: controllable pedestrian animation via guided trajectory diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13756\u201313766 (2023)","DOI":"10.1109\/CVPR52729.2023.01322"},{"key":"23_CR32","unstructured":"Ren, J., Zhang, M., Yu, C., Ma, X., Pan, L., Liu, Z.: InsActor: instruction-driven physics-based characters (2023)"},{"key":"23_CR33","unstructured":"Rocamonde, J., Montesinos, V., Nava, E., Perez, E., Lindner, D.: Vision-language models are zero-shot reward models for reinforcement learning. arXiv preprint arXiv:2310.12921 (2023)"},{"key":"23_CR34","unstructured":"Smith, R., et\u00a0al.: Open dynamics engine (2005)"},{"key":"23_CR35","unstructured":"Gemini Team Google, et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"23_CR36","doi-asserted-by":"publisher","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: MotionCLIP: exposing human motion generation to CLIP space. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision, ECCV 2022. LNCS, vol. 13682, pp. 358\u2013374. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_21","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"23_CR37","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-Or, D., Bermano, A.H.: Human motion diffusion model. arXiv preprint arXiv:2209.14916 (2022)"},{"key":"23_CR38","unstructured":"Wang, Z., Chen, Y., Liu, T., Zhu, Y., Liang, W., Huang, S.: HUMANISE: language-conditioned human motion generation in 3D scenes (2022)"},{"key":"23_CR39","doi-asserted-by":"crossref","unstructured":"Wei, D., et al.: Enhanced fine-grained motion diffusion for text-driven human motion synthesis (2023)","DOI":"10.1609\/aaai.v38i6.28401"},{"issue":"4","key":"23_CR40","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530067","volume":"41","author":"J Won","year":"2022","unstructured":"Won, J., Gopinath, D., Hodgins, J.: Physics-based character controllers using conditional VAEs. ACM Trans. Graph. (TOG) 41(4), 1\u201312 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR41","unstructured":"Xiao, Z., et al.: Unified human-scene interaction via prompted chain-of-contacts. arXiv preprint arXiv:2309.07918 (2023)"},{"key":"23_CR42","doi-asserted-by":"crossref","unstructured":"Xie, Z., Ling, H.Y., Kim, N.H., van\u00a0de Panne, M.: ALLSTEPS: curriculum-driven learning of stepping stone skills. In: Computer Graphics Forum, vol.\u00a039, pp. 213\u2013224. Wiley Online Library (2020)","DOI":"10.1111\/cgf.14115"},{"key":"23_CR43","doi-asserted-by":"crossref","unstructured":"Xu, S., Li, Z., Wang, Y.X., Gui, L.Y.: InterDiff: generating 3D human-object interactions with physics-informed diffusion (2023)","DOI":"10.1109\/ICCV51070.2023.01371"},{"key":"23_CR44","unstructured":"Yang, Z., et al.: The dawn of LMMs: preliminary explorations with GPT-4V (ision). arXiv preprint arXiv:2309.17421, 9(1), 1 (2023)"},{"issue":"6","key":"23_CR45","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3550454.3555434","volume":"41","author":"H Yao","year":"2022","unstructured":"Yao, H., Song, Z., Chen, B., Liu, L.: ControlVAE: model-based learning of generative controllers for physics-based characters. ACM Trans. Graph. (TOG) 41(6), 1\u201316 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR46","unstructured":"Yu, W., et\u00a0al.: Language to rewards for robotic skill synthesis. arXiv preprint arXiv:2306.08647 (2023)"},{"key":"23_CR47","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Song, J., Iqbal, U., Vahdat, A., Kautz, J.: PhysDiff: physics-guided human motion diffusion model. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16010\u201316021 (2023)","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"23_CR48","unstructured":"Zhang, M., et al.: MotionDiffuse: text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)"},{"key":"23_CR49","unstructured":"Zhang, Y., et al.: MotionGPT: finetuned LLMs are general-purpose motion generators. arXiv preprint arXiv:2306.10900 (2023)"},{"key":"23_CR50","doi-asserted-by":"crossref","unstructured":"Zhao, K., Wang, S., Zhang, Y., Beeler, T., Tang, S.: Compositional human-scene interaction synthesis with semantic control (2022)","DOI":"10.1007\/978-3-031-20068-7_18"},{"key":"23_CR51","doi-asserted-by":"crossref","unstructured":"Zhao, K., Zhang, Y., Wang, S., Beeler, T., Tang, S.: Synthesizing diverse human motions in 3D indoor scenes (2023)","DOI":"10.1109\/ICCV51070.2023.01354"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73337-6_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T23:07:07Z","timestamp":1730329627000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73337-6_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031733369","9783031733376"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73337-6_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}