{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:40:34Z","timestamp":1765309234075,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["92470121, 62402016"],"award-info":[{"award-number":["92470121, 62402016"]}]},{"name":"National Key R\\&D Program of China","award":["2024YFA1014003"],"award-info":[{"award-number":["2024YFA1014003"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754850","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"9404-9413","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Human Motion Generation in 3D Scenes from Open-Ended Textual Instructions with MLLM Planning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2793-0067","authenticated-orcid":false,"given":"Siyi","family":"Qian","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7018-6958","authenticated-orcid":false,"given":"Jian","family":"Fang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4374-9016","authenticated-orcid":false,"given":"Yuzhou","family":"Mao","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0517-3188","authenticated-orcid":false,"given":"Yayun","family":"Zou","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7532-5550","authenticated-orcid":false,"given":"Wentao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7318-9682","authenticated-orcid":false,"given":"Haiwei","family":"Xue","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. GPT-4 technical report (2023). arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"e_1_3_2_1_3_1","volume-title":"Efficient Multi-Person Motion Prediction by Lightweight Spatial and Temporal Interactions. arXiv preprint arXiv:2507.09446","author":"Anonymous","year":"2025","unstructured":"Anonymous. 2025. Efficient Multi-Person Motion Prediction by Lightweight Spatial and Temporal Interactions. arXiv preprint arXiv:2507.09446 (2025). arXiv:2507.09446 [cs.CV]"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV57658.2022.00053"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00062"},{"key":"e_1_3_2_1_6_1","volume-title":"Multi-Modal Diffusion for Hand-Object Grasp Generation. arXiv preprint arXiv:2409.04560","author":"Cao Jinkun","year":"2024","unstructured":"Jinkun Cao, Jingyuan Liu, Kris Kitani, and Yi Zhou. 2024. Multi-Modal Diffusion for Hand-Object Grasp Generation. arXiv preprint arXiv:2409.04560 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00182"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_2_1_9_1","volume-title":"Adam: A method for stochastic optimization. (No Title)","author":"Diederik Kingma","year":"2014","unstructured":"Kingma Diederik. 2014. Adam: A method for stochastic optimization. (No Title) (2014)."},{"key":"e_1_3_2_1_10_1","unstructured":"Martin Ester Hans-Peter Kriegel J\u00f6rg Sander Xiaowei Xu et al. 1996. A density-based algorithm for discovering clusters in large spatial databases with noise. In kdd Vol. 96. 226-231."},{"key":"e_1_3_2_1_11_1","volume-title":"European Conference on Computer Vision. Springer, 418-437","author":"Ghosh Anindita","year":"2024","unstructured":"Anindita Ghosh, Rishabh Dabral, Vladislav Golyanik, Christian Theobalt, and Philipp Slusallek. 2024. Remos: 3d motion-conditioned reaction synthesis for two-person interactions. In European Conference on Computer Vision. Springer, 418-437."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413635"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01118"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00237"},{"key":"e_1_3_2_1_16_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840-6851."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"e_1_3_2_1_18_1","first-page":"20067","article-title":"Motiongpt: Human motion as a foreign language","volume":"36","author":"Jiang Biao","year":"2023","unstructured":"Biao Jiang, Xin Chen, Wen Liu, Jingyi Yu, Gang Yu, and Tao Chen. 2023. Motiongpt: Human motion as a foreign language. Advances in Neural Information Processing Systems, Vol. 36 (2023), 20067-20079.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00171"},{"key":"e_1_3_2_1_20_1","volume-title":"European Conference on Computer Vision. Springer, 392-409","author":"Jin Peng","year":"2024","unstructured":"Peng Jin, Hao Li, Zesen Cheng, Kehan Li, Runyi Yu, Chang Liu, Xiangyang Ji, Li Yuan, and Jie Chen. 2024. Local action-guided motion diffusion model for text-to-motion generation. In European Conference on Computer Vision. Springer, 392-409."},{"key":"e_1_3_2_1_21_1","volume-title":"Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa.","author":"Kojima Takeshi","year":"2022","unstructured":"Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. 2022. Large language models are zero-shot reasoners. Advances in neural information processing systems, Vol. 35 (2022), 22199-22213."},{"key":"e_1_3_2_1_22_1","volume-title":"Dancing to music. Advances in neural information processing systems","author":"Lee Hsin-Ying","year":"2019","unstructured":"Hsin-Ying Lee, Xiaodong Yang, Ming-Yu Liu, Ting-Chun Wang, Yu-Ding Lu, Ming-Hsuan Yang, and Jan Kautz. 2019. Dancing to music. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_23_1","unstructured":"Jing Li Di Kang Wenjie Pei Xuefei Zhe and Linchao Bao. 2021. Audio2Gestures: Generating Diverse Gestures from Speech Audio with Conditional Variational Autoencoders. (2021)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00462"},{"key":"e_1_3_2_1_25_1","volume-title":"HIMO: A New Benchmark for Full-Body Human Interacting with Multiple Objects. In European Conference on Computer Vision. Springer, 300-318","author":"Lv Xintao","year":"2024","unstructured":"Xintao Lv, Liang Xu, Yichao Yan, Xin Jin, Congsheng Xu, Shuwen Wu, Yifan Liu, Lincheng Li, Mengxiao Bi, Wenjun Zeng, et al., 2024. HIMO: A New Benchmark for Full-Body Human Interacting with Multiple Objects. In European Conference on Computer Vision. Springer, 300-318."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00554"},{"key":"e_1_3_2_1_27_1","unstructured":"Ben Mann N Ryder M Subbiah J Kaplan P Dhariwal A Neelakantan P Shyam G Sastry A Askell S Agarwal et al. 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 Vol. 1 (2020) 3."},{"key":"e_1_3_2_1_28_1","first-page":"7356","article-title":"Contact-aware human motion forecasting","volume":"35","author":"Mao Wei","year":"2022","unstructured":"Wei Mao, Richard I Hartley, Mathieu Salzmann, et al., 2022. Contact-aware human motion forecasting. Advances in Neural Information Processing Systems, Vol. 35 (2022), 7356-7367.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","first-page":"20","article-title":"A comparison of RRT, RRT* and RRT*-smart path planning algorithms","volume":"16","author":"Noreen Iram","year":"2016","unstructured":"Iram Noreen, Amna Khan, and Zulfiqar Habib. 2016. A comparison of RRT, RRT* and RRT*-smart path planning algorithms. International Journal of Computer Science and Network Security (IJCSNS), Vol. 16, 10 (2016), 20.","journal-title":"International Journal of Computer Science and Network Security (IJCSNS)"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01123"},{"key":"e_1_3_2_1_31_1","volume-title":"Hoi-diff: Text-driven synthesis of 3d human-object interactions using diffusion models. arXiv preprint arXiv:2312.06553","author":"Peng Xiaogang","year":"2023","unstructured":"Xiaogang Peng, Yiming Xie, Zizhao Wu, Varun Jampani, Deqing Sun, and Huaizu Jiang. 2023. Hoi-diff: Text-driven synthesis of 3d human-object interactions using diffusion models. arXiv preprint arXiv:2312.06553 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"e_1_3_2_1_33_1","volume-title":"Single motion diffusion. arXiv preprint arXiv:2302.05905","author":"Raab Sigal","year":"2023","unstructured":"Sigal Raab, Inbal Leibovitch, Guy Tevet, Moab Arar, Amit H Bermano, and Daniel Cohen-Or. 2023. Single motion diffusion. arXiv preprint arXiv:2302.05905 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02008"},{"key":"e_1_3_2_1_36_1","volume-title":"Human motion diffusion model. arXiv preprint arXiv:2209.14916","author":"Tevet Guy","year":"2022","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yonatan Shafir, Daniel Cohen-Or, and Amit H Bermano. 2022. Human motion diffusion model. arXiv preprint arXiv:2209.14916 (2022)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00051"},{"key":"e_1_3_2_1_38_1","volume-title":"Purposer: Putting Human Motion Generation in Context. In 2024 International Conference on 3D Vision (3DV). IEEE, 1310-1319","author":"Ugrinovic Nicolas","year":"2024","unstructured":"Nicolas Ugrinovic, Thomas Lucas, Fabien Baradel, Philippe Weinzaepfel, Gr\u00e9gory Rogez, and Francesc Moreno-Noguer. 2024. Purposer: Putting Human Motion Generation in Context. In 2024 International Conference on 3D Vision (3DV). IEEE, 1310-1319."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01981"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00928"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00049"},{"key":"e_1_3_2_1_42_1","first-page":"14959","article-title":"Humanise: Language-conditioned human motion generation in 3d scenes","volume":"35","author":"Wang Zan","year":"2022","unstructured":"Zan Wang, Yixin Chen, Tengyu Liu, Yixin Zhu, Wei Liang, and Siyuan Huang. 2022a. Humanise: Language-conditioned human motion generation in 3d scenes. Advances in Neural Information Processing Systems, Vol. 35 (2022), 14959-14971.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","first-page":"105397","article-title":"InterControl: Zero-shot Human Interaction Generation by Controlling Every Joint","volume":"37","author":"Wang Zhenzhi","year":"2024","unstructured":"Zhenzhi Wang, Jingbo Wang, Yixuan Li, Dahua Lin, and Bo Dai. 2024b. InterControl: Zero-shot Human Interaction Generation by Controlling Every Joint. Advances in Neural Information Processing Systems, Vol. 37 (2024), 105397-105424.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_44_1","volume-title":"Unified human-scene interaction via prompted chain-of-contacts. arXiv preprint arXiv:2309.07918","author":"Xiao Zeqi","year":"2023","unstructured":"Zeqi Xiao, Tai Wang, Jingbo Wang, Jinkun Cao, Wenwei Zhang, Bo Dai, Dahua Lin, and Jiangmiao Pang. 2023. Unified human-scene interaction via prompted chain-of-contacts. arXiv preprint arXiv:2309.07918 (2023)."},{"key":"e_1_3_2_1_45_1","unstructured":"Zeqi Xiao Tai Wang Jingbo Wang Jinkun Cao Wenwei Zhang Bo Dai Dahua Lin and Jiangmiao Pang. 2024. Unified Human-Scene Interaction via Prompted Chain-of-Contacts. arXiv:2309.07918 [cs.CV] https:\/\/arxiv.org\/abs\/2309.07918"},{"key":"e_1_3_2_1_46_1","unstructured":"Haiwei Xue Xiangyang Luo Zhanghao Hu Xin Zhang Xunzhi Xiang Yuqin Dai Jianzhuang Liu Zhensong Zhang Minglei Li Jian Yang et al. 2024. Human motion video generation: A survey. Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Diffusestylegesture: Stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919","author":"Yang Sicheng","year":"2023","unstructured":"Sicheng Yang, Zhiyong Wu, Minglei Li, Zhensong Zhang, Lei Hao, Weihong Bao, Ming Cheng, and Long Xiao. 2023. Diffusestylegesture: Stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919 (2023)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"e_1_3_2_1_49_1","volume-title":"European Conference on Computer Vision. Springer, 246-263","author":"Yi Hongwei","year":"2024","unstructured":"Hongwei Yi, Justus Thies, Michael J Black, Xue Bin Peng, and Davis Rempe. 2024. Generating human interaction motions in scenes with text control. In European Conference on Computer Vision. Springer, 246-263."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58545-7_20"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"e_1_3_2_1_52_1","volume-title":"Motiondiffuse: Text-driven human motion generation with diffusion model","author":"Zhang Mingyuan","year":"2024","unstructured":"Mingyuan Zhang, Zhongang Cai, Liang Pan, Fangzhou Hong, Xinying Guo, Lei Yang, and Ziwei Liu. 2024a. Motiondiffuse: Text-driven human motion generation with diffusion model. IEEE transactions on pattern analysis and machine intelligence, Vol. 46, 6 (2024), 4115-4128."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"e_1_3_2_1_54_1","volume-title":"Eduardo P\u00e9rez Pellitero, and Gerard Pons-Moll","author":"Zhang Xiaohan","year":"2024","unstructured":"Xiaohan Zhang, Sebastian Starke, Vladimir Guzov, Zhensong Zhang, Eduardo P\u00e9rez Pellitero, and Gerard Pons-Moll. 2024c. SCENIC: Scene-aware Semantic Navigation with Instruction-guided Control. arXiv preprint arXiv:2412.15664 (2024)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00623"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28567"},{"key":"e_1_3_2_1_57_1","volume-title":"Automatic chain of thought prompting in large language models. arXiv preprint arXiv:2210.03493","author":"Zhang Zhuosheng","year":"2022","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, and Alex Smola. 2022. Automatic chain of thought prompting in large language models. arXiv preprint arXiv:2210.03493 (2022)."},{"key":"e_1_3_2_1_58_1","volume-title":"TopV-Nav: Unlocking the Top-View Spatial Reasoning Potential of MLLM for Zero-shot Object Navigation. arXiv preprint arXiv:2411.16425","author":"Zhong Linqing","year":"2024","unstructured":"Linqing Zhong, Chen Gao, Zihan Ding, Yue Liao, and Si Liu. 2024. TopV-Nav: Unlocking the Top-View Spatial Reasoning Potential of MLLM for Zero-shot Object Navigation. arXiv preprint arXiv:2411.16425 (2024)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00589"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754850","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:35:58Z","timestamp":1765308958000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754850"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":59,"alternative-id":["10.1145\/3746027.3754850","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754850","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}