{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:59Z","timestamp":1750309559838,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,21]]},"DOI":"10.1145\/3708657.3708774","type":"proceedings-article","created":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T11:17:46Z","timestamp":1748517466000},"page":"728-735","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Language-guided Human Motion Generation with Temporal Contact Diffusion in 3D Scenes"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-1522-1414","authenticated-orcid":false,"given":"Qiuzi","family":"Huang","sequence":"first","affiliation":[{"name":"School of Artificial Intellegence, Beijing University of Posts and Telecommunications, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2461-3480","authenticated-orcid":false,"given":"Jiaxin","family":"Tong","sequence":"additional","affiliation":[{"name":"School of Digital Media and Design Arts, Beijing University of Posts and Telecommunications, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4879-0887","authenticated-orcid":false,"given":"Mingtong","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Digital Media and Design Arts, Beijing University of Posts and Telecommunications, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9814-638X","authenticated-orcid":false,"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Digital Media and Design Arts, Beijing University of Posts and Telecommunications, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1058-2799","authenticated-orcid":false,"given":"Xueming","family":"Li","sequence":"additional","affiliation":[{"name":"School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,5,29]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Tenglong Ao Zeyi Zhang and Libin Liu. 2023. Gesturediffuclip: Gesture diffusion model with clip latents. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201318.","DOI":"10.1145\/3592097"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02032"},{"key":"e_1_3_3_1_5_2","unstructured":"Iz Beltagy Matthew\u00a0E Peters and Arman Cohan. 2020. Longformer: The long-document transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2004.05150 (2020)."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58601-0_22"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00062"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_23"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00941"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14739"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01447"},{"key":"e_1_3_3_1_14_2","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"e_1_3_3_1_16_2","first-page":"4651","volume-title":"International conference on machine learning","author":"Jaegle Andrew","year":"2021","unstructured":"Andrew Jaegle, Felix Gimeno, Andy Brock, Oriol Vinyals, Andrew Zisserman, and Joao Carreira. 2021. Perceiver: General perception with iterative attention. In International conference on machine learning. PMLR, 4651\u20134664."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01092"},{"key":"e_1_3_3_1_18_2","unstructured":"Diederik\u00a0P Kingma. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Jiaman Li Jiajun Wu and C\u00a0Karen Liu. 2023. Object motion guided human motion synthesis. ACM Transactions on Graphics (TOG) 42 6 (2023) 1\u201311.","DOI":"10.1145\/3618333"},{"key":"e_1_3_3_1_20_2","unstructured":"Qiujing Lu Yipeng Zhang Mingjian Lu and Vwani Roychowdhury. 2022. Action-conditioned On-demand Motion Generation. (Jul 2022)."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00554"},{"key":"e_1_3_3_1_22_2","unstructured":"Wei Mao Richard\u00a0I Hartley Mathieu Salzmann et\u00a0al. 2022. Contact-aware human motion forecasting. Advances in Neural Information Processing Systems 35 (2022) 7356\u20137367."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.01080"},{"key":"e_1_3_3_1_24_2","first-page":"4332","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision","author":"Prokudin Sergey","year":"2019","unstructured":"Sergey Prokudin, Christoph Lassner, and Javier Romero. 2019. Efficient learning on point clouds with basis point sets. In Proceedings of the IEEE\/CVF international conference on computer vision. 4332\u20134341."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3610543.3626176"},{"key":"e_1_3_3_1_26_2","unstructured":"Sigal Raab Inbal Leibovitch Guy Tevet Moab Arar Amit\u00a0H Bermano and Daniel Cohen-Or. 2023. Single Motion Diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.05905 (2023)."},{"key":"e_1_3_3_1_27_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_28_2","unstructured":"Yonatan Shafir Guy Tevet Roy Kapon and Amit\u00a0H Bermano. 2023. Human motion diffusion as a generative prior. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.01418 (2023)."},{"key":"e_1_3_3_1_29_2","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.02502 (2020)."},{"key":"e_1_3_3_1_30_2","unstructured":"Yang Song Jascha Sohl-Dickstein Diederik\u00a0P Kingma Abhishek Kumar Stefano Ermon and Ben Poole. 2020. Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2011.13456 (2020)."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01291"},{"key":"e_1_3_3_1_32_2","unstructured":"Guy Tevet Sigal Raab Brian Gordon Yonatan Shafir Daniel Cohen-Or and Amit\u00a0H Bermano. 2022. Human motion diffusion model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.14916 (2022)."},{"key":"e_1_3_3_1_33_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01981"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00928"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Zan Wang Yixin Chen Baoxiong Jia Puhao Li Jinlu Zhang Jingze Zhang Tengyu Liu Yixin Zhu Wei Liang and Siyuan Huang. 2024. Move as You Say Interact as You Can: Language-guided Human Motion Generation with Scene Affordance. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.18036 (2024).","DOI":"10.1109\/CVPR52733.2024.00049"},{"key":"e_1_3_3_1_37_2","unstructured":"Zan Wang Yixin Chen Tengyu Liu Yixin Zhu Wei Liang and Siyuan Huang. 2022. Humanise: Language-conditioned human motion generation in 3d scenes. Advances in Neural Information Processing Systems 35 (2022) 14959\u201314971."},{"key":"e_1_3_3_1_38_2","first-page":"618","volume-title":"Conference on Robot Learning","author":"Wu Yueh-Hua","year":"2023","unstructured":"Yueh-Hua Wu, Jiashun Wang, and Xiaolong Wang. 2023. Learning generalizable dexterous manipulation from human grasp affordance. In Conference on Robot Learning. PMLR, 618\u2013629."},{"key":"e_1_3_3_1_39_2","unstructured":"Zeqi Xiao Tai Wang Jingbo Wang Jinkun Cao Wenwei Zhang Bo Dai Dahua Lin and Jiangmiao Pang. 2023. Unified human-scene interaction via prompted chain-of-contacts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.07918 (2023)."},{"key":"e_1_3_3_1_40_2","unstructured":"Weihao Yuan Weichao Shen Yisheng He Yuan Dong Xiaodong Gu Zilong Dong Liefeng Bo and Qixing Huang. 2024. MoGenTS: Motion Generation based on Spatial-Temporal Joint Modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.17686 (2024)."},{"key":"e_1_3_3_1_41_2","unstructured":"Mingyuan Zhang Zhongang Cai Liang Pan Fangzhou Hong Xinying Guo Lei Yang and Ziwei Liu. 2022. Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.15001 (2022)."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/3DV50981.2020.00074"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20065-6_30"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00623"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01595"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00589"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"ICCIP 2024: 2024 the 10th International Conference on Communication and Information Processing","acronym":"ICCIP 2024","location":"Lingshui Hainan China"},"container-title":["Proceedings of the 2024 10th International Conference on Communication and Information Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708657.3708774","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3708657.3708774","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:58Z","timestamp":1750295938000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708657.3708774"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,21]]},"references-count":46,"alternative-id":["10.1145\/3708657.3708774","10.1145\/3708657"],"URL":"https:\/\/doi.org\/10.1145\/3708657.3708774","relation":{},"subject":[],"published":{"date-parts":[[2024,11,21]]},"assertion":[{"value":"2025-05-29","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}