{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,8]],"date-time":"2026-02-08T04:19:37Z","timestamp":1770524377230,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2022ZD0114900"],"award-info":[{"award-number":["2022ZD0114900"]}],"id":[{"id":"10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]},{"name":"NSFC","award":["62376009"],"award-info":[{"award-number":["62376009"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3680528.3687595","type":"proceedings-article","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T08:14:37Z","timestamp":1733213677000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Autonomous Character-Scene Interaction Synthesis from Text Instruction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-5726-7672","authenticated-orcid":false,"given":"Nan","family":"Jiang","sequence":"first","affiliation":[{"name":"Institute for AI, Peking University, Beijing, China and National Key Lab of General AI, BIGAI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2330-559X","authenticated-orcid":false,"given":"Zimo","family":"He","sequence":"additional","affiliation":[{"name":"Institute for AI, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3327-7700","authenticated-orcid":false,"given":"Zi","family":"Wang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0695-1008","authenticated-orcid":false,"given":"Hongjie","family":"Li","sequence":"additional","affiliation":[{"name":"Institute for AI, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8176-0241","authenticated-orcid":false,"given":"Yixin","family":"Chen","sequence":"additional","affiliation":[{"name":"National Key Lab of General AI, BIGAI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1524-7148","authenticated-orcid":false,"given":"Siyuan","family":"Huang","sequence":"additional","affiliation":[{"name":"National Key Lab of General AI, BIGAI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7024-1545","authenticated-orcid":false,"given":"Yixin","family":"Zhu","sequence":"additional","affiliation":[{"name":"Institute for AI, Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"e_1_3_3_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02032"},{"key":"e_1_3_3_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01547"},{"key":"e_1_3_3_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657440"},{"key":"e_1_3_3_1_5_1","doi-asserted-by":"crossref","unstructured":"Stelian Coros Philippe Beaudoin and Michiel Van\u00a0de Panne. 2010. Generalized biped walking control. ACM Transactions on Graphics (TOG) 29 4 (2010) 1\u20139.","DOI":"10.1145\/1778765.1781156"},{"key":"e_1_3_3_1_6_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01244"},{"key":"e_1_3_3_1_8_1","volume-title":"Eurographics","author":"Ghosh Anindita","year":"2023","unstructured":"Anindita Ghosh, Rishabh Dabral, Vladislav Golyanik, Christian Theobalt, and Philipp Slusallek. 2023. IMoS: Intent-Driven Full-Body Motion Synthesis for Human-Object Interactions. In Eurographics."},{"key":"e_1_3_3_1_9_1","volume-title":"International Conference on 3D Vision (3DV)","author":"Guzov Vladimir","year":"2023","unstructured":"Vladimir Guzov, Julian Chibane, Riccardo Marin, Yannan He, Yunus Saracoglu, Torsten Sattler, and Gerard Pons-Moll. 2023. Interaction Replica: Tracking human\u2013object interaction and scene changes from human motion. In International Conference on 3D Vision (3DV)."},{"key":"e_1_3_3_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01118"},{"key":"e_1_3_3_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00237"},{"key":"e_1_3_3_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591525"},{"key":"e_1_3_3_1_13_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"He Chengan","year":"2022","unstructured":"Chengan He, Jun Saito, James Zachary, Holly Rushmeier, and Yi Zhou. 2022. Nemf: Neural motion fields for kinematic animation. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_14_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"e_1_3_3_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00859"},{"key":"e_1_3_3_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00171"},{"key":"e_1_3_3_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00886"},{"key":"e_1_3_3_1_19_1","volume-title":"European Conference on Computer Vision (ECCV)","author":"Li Jiaman","year":"2023","unstructured":"Jiaman Li, Alexander Clegg, Roozbeh Mottaghi, Jiajun Wu, Xavier Puig, and C\u00a0Karen Liu. 2023a. Controllable human-object interaction synthesis. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_3_1_20_1","doi-asserted-by":"crossref","unstructured":"Jiaman Li Jiajun Wu and C\u00a0Karen Liu. 2023b. Object motion guided human motion synthesis. ACM Transactions on Graphics (TOG) 42 6 (2023) 1\u201311.","DOI":"10.1145\/3618333"},{"key":"e_1_3_3_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01265"},{"key":"e_1_3_3_1_22_1","doi-asserted-by":"crossref","unstructured":"Libin Liu and Jessica Hodgins. 2017. Learning to schedule control fragments for physics-based characters using deep q-learning. ACM Transactions on Graphics (TOG) 36 3 (2017) 1\u201314.","DOI":"10.1145\/3083723"},{"key":"e_1_3_3_1_23_1","doi-asserted-by":"crossref","unstructured":"Libin Liu and Jessica Hodgins. 2018. Learning basketball dribbling skills using trajectory optimization and deep reinforcement learning. ACM Transactions on Graphics (TOG) 37 4 (2018) 1\u201314.","DOI":"10.1145\/3197517.3201315"},{"key":"e_1_3_3_1_24_1","doi-asserted-by":"crossref","unstructured":"Josh Merel Saran Tunyasuvunakool Arun Ahuja Yuval Tassa Leonard Hasenclever Vu Pham Tom Erez Greg Wayne and Nicolas Heess. 2020. Catch & carry: reusable neural controllers for vision-guided whole-body tasks. ACM Transactions on Graphics (TOG) 39 4 (2020) 39\u20131.","DOI":"10.1145\/3386569.3392474"},{"key":"e_1_3_3_1_25_1","doi-asserted-by":"crossref","unstructured":"Aron Monszpart Paul Guerrero Duygu Ceylan Ersin Yumer and Niloy\u00a0J Mitra. 2019. iMapper: interaction-guided scene mapping from monocular videos. ACM Transactions on Graphics (TOG) 38 4 (2019) 1\u201315.","DOI":"10.1145\/3306346.3322961"},{"key":"e_1_3_3_1_26_1","doi-asserted-by":"crossref","unstructured":"Igor Mordatch Emanuel Todorov and Zoran Popovi\u0107. 2012. Discovery of complex behaviors through contact-invariant optimization. ACM Transactions on Graphics (TOG) 31 4 (2012) 1\u20138.","DOI":"10.1145\/2185520.2335394"},{"key":"e_1_3_3_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01123"},{"key":"e_1_3_3_1_28_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Peng Xue\u00a0Bin","year":"2019","unstructured":"Xue\u00a0Bin Peng, Michael Chang, Grace Zhang, Pieter Abbeel, and Sergey Levine. 2019. Mcp: Learning composable hierarchical control with multiplicative compositional policies. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_29_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_1_30_1","doi-asserted-by":"crossref","unstructured":"Manolis Savva Angel\u00a0X Chang Pat Hanrahan Matthew Fisher and Matthias Nie\u00dfner. 2016. Pigraphs: learning interaction snapshots from observations. ACM Transactions on Graphics (TOG) 35 4 (2016) 1\u201312.","DOI":"10.1145\/2897824.2925867"},{"key":"e_1_3_3_1_31_1","doi-asserted-by":"crossref","unstructured":"Sebastian Starke He Zhang Taku Komura and Jun Saito. 2019. Neural state machine for character-scene interactions. ACM Transactions on Graphics (TOG) 38 6 (2019) 178.","DOI":"10.1145\/3355089.3356505"},{"key":"e_1_3_3_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01291"},{"key":"e_1_3_3_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01291"},{"key":"e_1_3_3_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_34"},{"key":"e_1_3_3_1_35_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Tevet Guy","year":"2022","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yoni Shafir, Daniel Cohen-or, and Amit\u00a0Haim Bermano. 2022. Human Motion Diffusion Model. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_37"},{"key":"e_1_3_3_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00928"},{"key":"e_1_3_3_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01203"},{"key":"e_1_3_3_1_39_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Wang Zan","year":"2022","unstructured":"Zan Wang, Yixin Chen, Tengyu Liu, Yixin Zhu, Wei Liang, and Siyuan Huang. 2022. Humanise: Language-conditioned human motion generation in 3d scenes. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_15"},{"key":"e_1_3_3_1_41_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Xiao Zeqi","year":"2024","unstructured":"Zeqi Xiao, Tai Wang, Jingbo Wang, Jinkun Cao, Wenwei Zhang, Bo Dai, Dahua Lin, and Jiangmiao Pang. 2024. Unified Human-Scene Interaction via Prompted Chain-of-Contacts. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_42_1","volume-title":"Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Yi Hongwei","year":"2024","unstructured":"Hongwei Yi, Justus Thies, Michael\u00a0J Black, Xue\u00a0Bin Peng, and Davis Rempe. 2024. Generating Human Interaction Motions in Scenes with Text Control. In Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_3_1_43_1","volume-title":"International Conference on 3D Vision (3DV)","author":"Zhang Siwei","year":"2020","unstructured":"Siwei Zhang, Yan Zhang, Qianli Ma, Michael\u00a0J Black, and Siyu Tang. 2020b. Generating person-scene interactions in 3d scenes. In International Conference on 3D Vision (3DV)."},{"key":"e_1_3_3_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20065-6_30"},{"key":"e_1_3_3_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00623"},{"key":"e_1_3_3_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01983"},{"key":"e_1_3_3_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_18"},{"key":"e_1_3_3_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01354"}],"event":{"name":"SA '24: SIGGRAPH Asia 2024 Conference Papers","location":"Tokyo Japan","acronym":"SA '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["SIGGRAPH Asia 2024 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687595","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680528.3687595","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:58:26Z","timestamp":1750294706000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687595"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":47,"alternative-id":["10.1145\/3680528.3687595","10.1145\/3680528"],"URL":"https:\/\/doi.org\/10.1145\/3680528.3687595","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}