{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T12:17:21Z","timestamp":1767183441295,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","funder":[{"name":"The STI 2030-Major Projects","award":["2021ZD0201404"],"award-info":[{"award-number":["2021ZD0201404"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755097","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"9743-9752","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Separate to Collaborate: Dual-Stream Diffusion Model for Coordinated Piano Hand Motion Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-7228-2792","authenticated-orcid":false,"given":"Zihao","family":"Liu","sequence":"first","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9743-2463","authenticated-orcid":false,"given":"Mingwen","family":"Ou","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5586-4971","authenticated-orcid":false,"given":"Zunnan","family":"Xu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0030-6555","authenticated-orcid":false,"given":"Jiaqi","family":"Huang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4894-3860","authenticated-orcid":false,"given":"Haonan","family":"Han","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7703-9315","authenticated-orcid":false,"given":"Ronghui","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0403-1923","authenticated-orcid":false,"given":"Xiu","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592458"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1177\/0278364919887447"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680847"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02110"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591528"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"e_1_3_2_2_7_1","volume-title":"Diffusion models beat gans on image synthesis. Advances in neural information processing systems","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems, Vol. 34 (2021), 8780-8794."},{"key":"e_1_3_2_2_8_1","volume-title":"PianoMotion10M: Dataset and Benchmark for Hand Motion Generation in Piano Performance. arXiv preprint arXiv:2406.09326","author":"Gan Qijun","year":"2024","unstructured":"Qijun Gan, Song Wang, Shengtao Wu, and Jianke Zhu. 2024. PianoMotion10M: Dataset and Benchmark for Hand Motion Generation in Piano Performance. arXiv preprint arXiv:2406.09326 (2024)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00186"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"e_1_3_2_2_11_1","volume-title":"AToM: Aligning Text-to-Motion Model at Event-Level with GPT-4Vision Reward. arXiv preprint arXiv:2411.18654","author":"Han Haonan","year":"2024","unstructured":"Haonan Han, Xiangzuo Wu, Huan Liao, Zunnan Xu, Zhongyuan Hu, Ronghui Li, Yachao Zhang, and Xiu Li. 2024a. AToM: Aligning Text-to-Motion Model at Event-Level with GPT-4Vision Reward. arXiv preprint arXiv:2411.18654 (2024)."},{"key":"e_1_3_2_2_12_1","volume-title":"Reparo: Compositional 3d assets generation with differentiable 3d layout alignment. arXiv preprint arXiv:2405.18525","author":"Han Haonan","year":"2024","unstructured":"Haonan Han, Rui Yang, Huan Liao, Jiankai Xing, Zunnan Xu, Xiaoming Yu, Junwei Zha, Xiu Li, and Wanhua Li. 2024b. Reparo: Compositional 3d assets generation with differentiable 3d layout alignment. arXiv preprint arXiv:2405.18525 (2024)."},{"key":"e_1_3_2_2_13_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840-6851."},{"key":"e_1_3_2_2_14_1","volume-title":"Audio-visual Controlled Video Diffusion with Masked Selective State Spaces Modeling for Natural Talking Head Generation. arXiv preprint arXiv:2504.02542","author":"Hong Fa-Ting","year":"2025","unstructured":"Fa-Ting Hong, Zunnan Xu, Zixiang Zhou, Jun Zhou, Xiu Li, Qin Lin, Qinglin Lu, and Dan Xu. 2025. Audio-visual Controlled Video Diffusion with Masked Selective State Spaces Modeling for Natural Talking Head Generation. arXiv preprint arXiv:2504.02542 (2025)."},{"key":"e_1_3_2_2_15_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451-3460."},{"key":"e_1_3_2_2_16_1","first-page":"20067","article-title":"Motiongpt: Human motion as a foreign language","volume":"36","author":"Jiang Biao","year":"2023","unstructured":"Biao Jiang, Xin Chen, Wen Liu, Jingyi Yu, Gang Yu, and Tao Chen. 2023. Motiongpt: Human motion as a foreign language. Advances in Neural Information Processing Systems, Vol. 36 (2023), 20067-20079.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_17_1","first-page":"15497","article-title":"Act as you wish: Fine-grained control of motion diffusion model with hierarchical semantic graphs","volume":"36","author":"Jin Peng","year":"2023","unstructured":"Peng Jin, Yang Wu, Yanbo Fan, Zhongqian Sun, Wei Yang, and Li Yuan. 2023. Act as you wish: Fine-grained control of motion diffusion model with hierarchical semantic graphs. Advances in Neural Information Processing Systems, Vol. 36 (2023), 15497-15518.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_18_1","volume-title":"Alignment is All You Need: A Training-free Augmentation Strategy for Pose-guided Video Generation. arXiv preprint arXiv:2408.16506","author":"Jin Xiaoyu","year":"2024","unstructured":"Xiaoyu Jin, Zunnan Xu, Mingwen Ou, and Wenming Yang. 2024. Alignment is All You Need: A Training-free Augmentation Strategy for Pose-guided Video Generation. arXiv preprint arXiv:2408.16506 (2024)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00205"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00361"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01360"},{"key":"e_1_3_2_2_22_1","volume-title":"AAMAS'02 Workshop Embodied conversational agents-let's specify and evaluate them!","author":"Kranstedt Alfred","year":"2002","unstructured":"Alfred Kranstedt, Stefan Kopp, and Ipke Wachsmuth. 2002. Murml: A multimodal utterance representation markup language for conversational agents. In AAMAS'02 Workshop Embodied conversational agents-let's specify and evaluate them!"},{"key":"e_1_3_2_2_23_1","volume-title":"Robert William Boyd, et al","author":"Laplante Philip A","year":"2018","unstructured":"Philip A Laplante, Robin Cravey, Lawrence P Dunleavy, James L Antonakos, Rodney LeRoy, Jack East, Nicholas E Buris, Christopher J Conant, Lawrence Fryda, Robert William Boyd, et al., 2018. Comprehensive dictionary of electrical engineering. CRC Press."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00057"},{"key":"e_1_3_2_2_25_1","volume-title":"DisPose: Disentangling Pose Guidance for Controllable Human Image Animation. arXiv preprint arXiv:2412.09349","author":"Li Hongxiang","year":"2024","unstructured":"Hongxiang Li, Yaowei Li, Yuhang Yang, Junjie Cao, Zhihong Zhu, Xuxin Cheng, and Long Chen. 2024b. DisPose: Disentangling Pose Guidance for Controllable Human Image Animation. arXiv preprint arXiv:2412.09349 (2024)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447825"},{"key":"e_1_3_2_2_27_1","volume-title":"2024 e. Lodge: High-quality and Long Dance Generation with Vivid Choreography Patterns. arXiv preprint arXiv:2410.20389","author":"Li Ronghui","year":"2024","unstructured":"Ronghui Li, Hongwen Zhang, Yachao Zhang, Yuxiang Zhang, Youliang Zhang, Jie Guo, Yan Zhang, Xiu Li, and Yebin Liu. 2024 e. Lodge: High-quality and Long Dance Generation with Vivid Choreography Patterns. arXiv preprint arXiv:2410.20389 (2024)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00151"},{"key":"e_1_3_2_2_29_1","volume-title":"InterDance: Reactive 3D Dance Generation with Realistic Duet Interactions. arXiv preprint arXiv:2412.16982","author":"Li Ronghui","year":"2024","unstructured":"Ronghui Li, Youliang Zhang, Yachao Zhang, Yuxiang Zhang, Mingyang Su, Jie Guo, Ziwei Liu, Yebin Liu, and Xiu Li. 2024d. InterDance: Reactive 3D Dance Generation with Realistic Duet Interactions. arXiv preprint arXiv:2412.16982 (2024)."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00939"},{"key":"e_1_3_2_2_31_1","volume-title":"Guosheng Yin, and Xiu Li.","author":"Lin Yukang","year":"2025","unstructured":"Yukang Lin, Hokit Fung, Jianjin Xu, Zeping Ren, Adela SM Lau, Guosheng Yin, and Xiu Li. 2025. MVPortrait: Text-Guided Motion and Emotion Control for Multi-view Vivid Portrait Animation. arXiv preprint arXiv:2503.19383 (2025)."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680994"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.5555\/1632592.1632616"},{"key":"e_1_3_2_2_34_1","volume-title":"HumanTOMATO: Text-aligned Whole-body Motion Generation. In International Conference on Machine Learning. PMLR, 32939-32977","author":"Lu Shunlin","year":"2024","unstructured":"Shunlin Lu, Ling-Hao Chen, Ailing Zeng, Jing Lin, Ruimao Zhang, Lei Zhang, and Heung-Yeung Shum. 2024. HumanTOMATO: Text-aligned Whole-body Motion Generation. In International Conference on Machine Learning. PMLR, 32939-32977."},{"key":"e_1_3_2_2_35_1","volume-title":"Adversarial Distribution Matching for Diffusion Distillation Towards Efficient Image and Video Synthesis. arXiv preprint arXiv:2507.18569","author":"Lu Yanzuo","year":"2025","unstructured":"Yanzuo Lu, Yuxi Ren, Xin Xia, Shanchuan Lin, Xing Wang, Xuefeng Xiao, Andy J. Ma, Xiaohua Xie, and Jian-Huang Lai. 2025. Adversarial Distribution Matching for Diffusion Distillation Towards Efficient Image and Video Synthesis. arXiv preprint arXiv:2507.18569 (2025)."},{"key":"e_1_3_2_2_36_1","volume-title":"Latent consistency models: Synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378","author":"Luo Simian","year":"2023","unstructured":"Simian Luo, Yiqin Tan, Longbo Huang, Jian Li, and Hang Zhao. 2023. Latent consistency models: Synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378 (2023)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i6.32633"},{"key":"e_1_3_2_2_38_1","volume-title":"MS-DETR: Towards Effective Video Moment Retrieval and Highlight Detection by Joint Motion-Semantic Learning. arXiv preprint arXiv:2507.12062","author":"Ma Hongxu","year":"2025","unstructured":"Hongxu Ma, Guanshuo Wang, Fufu Yu, Qiong Jia, and Shouhong Ding. 2025a. MS-DETR: Towards Effective Video Moment Retrieval and Highlight Detection by Joint Motion-Semantic Learning. arXiv preprint arXiv:2507.12062 (2025)."},{"key":"e_1_3_2_2_39_1","volume-title":"Fine-Grained Zero-Shot Object Detection. arXiv preprint arXiv:2507.10358","author":"Ma Hongxu","year":"2025","unstructured":"Hongxu Ma, Chenbo Zhang, Lu Zhang, Jiaogen Zhou, Jihong Guan, and Shuigeng Zhou. 2025b. Fine-Grained Zero-Shot Object Detection. arXiv preprint arXiv:2507.10358 (2025)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01975"},{"key":"e_1_3_2_2_41_1","volume-title":"Proceedings of the ACM SIGGRAPH\/Eurographics symposium on computer animation. 137-144","author":"Mordatch Igor","year":"2012","unstructured":"Igor Mordatch, Zoran Popovi\u0107, and Emanuel Todorov. 2012. Contact-invariant optimization for hand manipulation. In Proceedings of the ACM SIGGRAPH\/Eurographics symposium on computer animation. 137-144."},{"key":"e_1_3_2_2_42_1","volume-title":"International conference on machine learning. PMLR, 8162-8171","author":"Nichol Alexander Quinn","year":"2021","unstructured":"Alexander Quinn Nichol and Prafulla Dhariwal. 2021. Improved denoising diffusion probabilistic models. In International conference on machine learning. PMLR, 8162-8171."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"e_1_3_2_2_44_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_2_45_1","volume-title":"Automating the production of communicative gestures in embodied characters. Frontiers in psychology","author":"Ravenet Brian","year":"2018","unstructured":"Brian Ravenet, Catherine Pelachaud, Chlo\u00e9 Clavel, and Stacy Marsella. 2018. Automating the production of communicative gestures in embodied characters. Frontiers in psychology, Vol. 9 (2018), 1144."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130883"},{"key":"e_1_3_2_2_47_1","volume-title":"Carlo Tomasi, and Leonidas J Guibas","author":"Rubner Yossi","year":"2000","unstructured":"Yossi Rubner, Carlo Tomasi, and Leonidas J Guibas. 2000. The earth mover's distance as a metric for image retrieval. International journal of computer vision, Vol. 40 (2000), 99-121."},{"key":"e_1_3_2_2_48_1","volume-title":"Progressive distillation for fast sampling of diffusion models. arXiv preprint arXiv:2202.00512","author":"Salimans Tim","year":"2022","unstructured":"Tim Salimans and Jonathan Ho. 2022. Progressive distillation for fast sampling of diffusion models. arXiv preprint arXiv:2202.00512 (2022)."},{"key":"e_1_3_2_2_49_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations.","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_50_1","volume-title":"Human Motion Diffusion Model. In The Eleventh International Conference on Learning Representations.","author":"Tevet Guy","year":"2023","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yoni Shafir, Daniel Cohen-or, and Amit Haim Bermano. 2023. Human Motion Diffusion Model. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_2_51_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_2_52_1","volume-title":"F\u00fcrElise: Capturing and Physically Synthesizing Hand Motion of Piano Performance. In SIGGRAPH Asia 2024 Conference Papers. 1-11","author":"Wang Ruocheng","year":"2024","unstructured":"Ruocheng Wang, Pei Xu, Haochen Shi, Elizabeth Schumann, and C Karen Liu. 2024. F\u00fcrElise: Capturing and Physically Synthesizing Hand Motion of Piano Performance. In SIGGRAPH Asia 2024 Conference Papers. 1-11."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02014"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/2508363.2508413"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3606931"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS47612.2022.9981221"},{"key":"e_1_3_2_2_57_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Xu Zunnan","year":"2024","unstructured":"Zunnan Xu, Yukang Lin, Haonan Han, Sicheng Yang, Ronghui Li, Yachao Zhang, and Xiu Li. 2024a. Mambatalk: Efficient holistic gesture synthesis with selective state space models. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_2_58_1","volume-title":"Hunyuanportrait: Implicit condition control for enhanced portrait animation. arXiv preprint arXiv:2503.18860","author":"Xu Zunnan","year":"2025","unstructured":"Zunnan Xu, Zhentao Yu, Zixiang Zhou, Jun Zhou, Xiaoyu Jin, Fa-Ting Hong, Xiaozhong Ji, Junwei Zhu, Chengfei Cai, Shiyu Tang, et al., 2025. Hunyuanportrait: Implicit condition control for enhanced portrait animation. arXiv preprint arXiv:2503.18860 (2025)."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28458"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612503"},{"key":"e_1_3_2_2_61_1","volume-title":"Diffusestylegesture: Stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919","author":"Yang Sicheng","year":"2023","unstructured":"Sicheng Yang, Zhiyong Wu, Minglei Li, Zhensong Zhang, Lei Hao, Weihong Bao, Ming Cheng, and Long Xiao. 2023c. Diffusestylegesture: Stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919 (2023)."},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00230"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447978"},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530057"},{"key":"e_1_3_2_2_65_1","volume-title":"Differential transformer. arXiv preprint arXiv:2410.05258","author":"Ye Tianzhu","year":"2024","unstructured":"Tianzhu Ye, Li Dong, Yuqing Xia, Yutao Sun, Yi Zhu, Gao Huang, and Furu Wei. 2024. Differential transformer. arXiv preprint arXiv:2410.05258 (2024)."},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"e_1_3_2_2_67_1","volume-title":"RoboPianist: Dexterous Piano Playing with Deep Reinforcement Learning. In Conference on Robot Learning (CoRL).","author":"Zakka Kevin","year":"2023","unstructured":"Kevin Zakka, Philipp Wu, Laura Smith, Nimrod Gileadi, Taylor Howell, Xue Bin Peng, Sumeet Singh, Yuval Tassa, Pete Florence, Andy Zeng, and Pieter Abbeel. 2023. RoboPianist: Dexterous Piano Playing with Deep Reinforcement Learning. In Conference on Robot Learning (CoRL)."},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480500"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"e_1_3_2_2_70_1","volume-title":"Motiondiffuse: Text-driven human motion generation with diffusion model","author":"Zhang Mingyuan","year":"2024","unstructured":"Mingyuan Zhang, Zhongang Cai, Liang Pan, Fangzhou Hong, Xinying Guo, Lei Yang, and Ziwei Liu. 2024. Motiondiffuse: Text-driven human motion generation with diffusion model. IEEE transactions on pattern analysis and machine intelligence, Vol. 46, 6 (2024), 4115-4128."},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01902"},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00053"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755097","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:21:20Z","timestamp":1765308080000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755097"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":72,"alternative-id":["10.1145\/3746027.3755097","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755097","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}