{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:25:19Z","timestamp":1778081119412,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Key R&D Program of China","award":["2022ZD0115502"],"award-info":[{"award-number":["2022ZD0115502"]}]},{"name":"CCF-DiDi GAIA Collaborative Research Funds"},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62122010"],"award-info":[{"award-number":["62122010"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612307","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"1374-1382","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":29,"title":["DiffDance: Cascaded Human Motion Diffusion Model for Dance Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9977-967X","authenticated-orcid":false,"given":"Qiaosong","family":"Qi","sequence":"first","affiliation":[{"name":"Alibaba Group, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7895-091X","authenticated-orcid":false,"given":"Le","family":"Zhuo","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9863-0091","authenticated-orcid":false,"given":"Aixi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2671-0655","authenticated-orcid":false,"given":"Yue","family":"Liao","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6535-5553","authenticated-orcid":false,"given":"Fei","family":"Fang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9180-2935","authenticated-orcid":false,"given":"Si","family":"Liu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8906-3777","authenticated-orcid":false,"given":"Shuicheng","family":"Yan","sequence":"additional","affiliation":[{"name":"BAAI &amp; Skyworks, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00724"},{"key":"e_1_3_2_1_2_1","volume-title":"Lucas Smaira, Sander Dieleman, and Andrew Zisserman.","author":"Alayrac Jean-Baptiste","year":"2020","unstructured":"Jean-Baptiste Alayrac, Adri\u00e0 Recasens, Rosalia Schneider, Relja Arandjelovic, Jason Ramapuram, Jeffrey De Fauw, Lucas Smaira, Sander Dieleman, and Andrew Zisserman. 2020. Self-Supervised MultiModal Versatile Networks. In NeurIPS."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.173"},{"key":"e_1_3_2_1_4_1","unstructured":"Kamil Deja Anna Kuzina Tomasz Trzcinski and Jakub M Tomczak. 2022. On Analyzing Generative and Denoising Capabilities of Diffusion-based Deep Generative Models. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_5_1","first-page":"8780","article-title":"Diffusion models beat gans on image synthesis","volume":"34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in Neural Information Processing Systems 34 (2021), 8780--8794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Yinglin Duan Tianyang Shi Zhipeng Hu Zhengxia Zou Changjie Fan Yi Yuan and Xi Li. 2021. Automatic Translation of Music-to-Dance for In-Game Characters. In IJCAI.","DOI":"10.24963\/ijcai.2021\/323"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2020.09.009"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00723"},{"key":"e_1_3_2_1_9_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_10_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik P Kingma Ben Poole Mohammad Norouzi David J Fleet et al. 2022. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_1_11_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems 33 (2020), 6840--6851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","first-page":"47","article-title":"Cascaded Diffusion Models for High Fidelity Image Generation","volume":"23","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, Chitwan Saharia, William Chan, David J Fleet, Mohammad Norouzi, and Tim Salimans. 2022. Cascaded Diffusion Models for High Fidelity Image Generation. J. Mach. Learn. Res. 23 (2022), 47--1.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_13_1","volume-title":"Classifier-Free Diffusion Guidance. In NeurIPS Workshop on Deep Generative Models and Downstream Applications.","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans. 2021. Classifier-Free Diffusion Guidance. In NeurIPS Workshop on Deep Generative Models and Downstream Applications."},{"key":"e_1_3_2_1_14_1","volume-title":"Dance Revolution: Long-Term Dance Generation with Music via Curriculum Learning. In ICLR.","author":"Huang Ruozi","year":"2021","unstructured":"Ruozi Huang, Huang Hu, Wei Wu, Kei Sawada, Mi Zhang, and Daxin Jiang. 2021. Dance Revolution: Long-Term Dance Generation with Music via Curriculum Learning. In ICLR."},{"key":"e_1_3_2_1_15_1","volume-title":"Jun Wang, Dan Su, Dong Yu, Yi Ren, and Zhou Zhao.","author":"Huang Rongjie","year":"2022","unstructured":"Rongjie Huang, Max WY Lam, Jun Wang, Dan Su, Dong Yu, Yi Ren, and Zhou Zhao. 2022. FastDiff: A Fast Conditional Diffusion Model for High-Quality Speech Synthesis. In IJCAI."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00348"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00530"},{"key":"e_1_3_2_1_18_1","unstructured":"Zhifeng Kong Wei Ping Jiaji Huang Kexin Zhao and Bryan Catanzaro. 2021. DiffWave: A Versatile Diffusion Model for Audio Synthesis. In ICLR."},{"key":"e_1_3_2_1_19_1","volume-title":"Listen to dance: Music-driven choreography generation using autoregressive encoder-decoder network. arXiv preprint arXiv:1811.00818","author":"Lee Juheon","year":"2018","unstructured":"Juheon Lee, Seohyun Kim, and Kyogu Lee. 2018. Listen to dance: Music-driven choreography generation using autoregressive encoder-decoder network. arXiv preprint arXiv:1811.00818 (2018)."},{"key":"e_1_3_2_1_20_1","volume-title":"Learning to Generate Diverse Dance Motions with Transformer. ArXiv abs\/2008.08171","author":"Li Jiaman","year":"2020","unstructured":"Jiaman Li, Yihang Yin, Hang Chu, Yi Zhou, Tingwu Wang, Sanja Fidler, and Hao Li. 2020. Learning to Generate Diverse Dance Motions with Transformer. ArXiv abs\/2008.08171 (2020)."},{"key":"e_1_3_2_1_21_1","unstructured":"Ruilong Li Shan Yang D A Ross and Angjoo Kanazawa. 2021. AI choreographer: Music conditioned 3d dance generation with aist. In ICCV."},{"key":"e_1_3_2_1_22_1","unstructured":"Xiang Lisa Li John Thickstun Ishaan Gulrajani Percy Liang and Tatsunori B Hashimoto. 2022. Diffusion-LM Improves Controllable Text Generation. (2022)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2816795.2818013"},{"key":"e_1_3_2_1_24_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_25_1","volume-title":"Matt McVicar, Eric Battenberg, and Oriol Nieto.","author":"McFee Brian","year":"2015","unstructured":"Brian McFee, Colin Raffel, Dawen Liang, Daniel PW Ellis, Matt McVicar, Eric Battenberg, and Oriol Nieto. 2015. librosa: Audio and music signal analysis in python. In SciPy."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Meinard M\u00fcller Tido R\u00f6der and Michael Clausen. 2005. Efficient content-based retrieval of motion capture data. In SIGGRAPH. 677--685.","DOI":"10.1145\/1073204.1073247"},{"key":"e_1_3_2_1_27_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_1_28_1","unstructured":"Kensuke Onuma Christos Faloutsos and Jessica K Hodgins. 2008. FMDistance: A Fast and Effective Distance Function for Motion Capture Data.. In Eurographics. 83--86."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"e_1_3_2_1_30_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_31_1","volume-title":"Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. arXiv preprint arXiv:2205.11487 (2022)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3407659"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00790"},{"key":"e_1_3_2_1_34_1","volume-title":"Chen Change Loy, and Ziwei Liu","author":"Siyao Li","year":"2022","unstructured":"Li Siyao, Weijiang Yu, Tianpei Gu, Chunze Lin, Quan Wang, Chen Qian, Chen Change Loy, and Ziwei Liu. 2022. Bailando: 3D dance generation via Actor-Critic GPT with Choreographic Memory. In CVPR."},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference on Machine Learning. PMLR, 2256--2265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International Conference on Machine Learning. PMLR, 2256--2265."},{"key":"e_1_3_2_1_36_1","volume-title":"Generative modeling by estimating gradients of the data distribution. Advances in Neural Information Processing Systems 32","author":"Song Yang","year":"2019","unstructured":"Yang Song and Stefano Ermon. 2019. Generative modeling by estimating gradients of the data distribution. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_37_1","volume-title":"MotionCLIP: Exposing Human Motion Generation to CLIP Space. arXiv preprint arXiv:2203.08063","author":"Tevet Guy","year":"2022","unstructured":"Guy Tevet, Brian Gordon, Amir Hertz, Amit H Bermano, and Daniel Cohen-Or. 2022. MotionCLIP: Exposing Human Motion Generation to CLIP Space. arXiv preprint arXiv:2203.08063 (2022)."},{"key":"e_1_3_2_1_38_1","volume-title":"Human Motion Diffusion Model. arXiv preprint arXiv:2209.14916","author":"Tevet Guy","year":"2022","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yonatan Shafir, Amit H Bermano, and Daniel Cohen-Or. 2022. Human Motion Diffusion Model. arXiv preprint arXiv:2209.14916 (2022)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2019.8851872"},{"key":"e_1_3_2_1_41_1","volume-title":"LION: Latent Point Diffusion Models for 3D Shape Generation. In Advances in Neural Information Processing Systems.","author":"Zeng Xiaohui","year":"2022","unstructured":"Xiaohui Zeng, Arash Vahdat, Francis Williams, Zan Gojcic, Or Litany, Sanja Fidler, and Karsten Kreis. 2022. LION: Latent Point Diffusion Models for 3D Shape Generation. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_42_1","volume-title":"Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001","author":"Zhang Mingyuan","year":"2022","unstructured":"Mingyuan Zhang, Zhongang Cai, Liang Pan, Fangzhou Hong, Xinying Guo, Lei Yang, and Ziwei Liu. 2022. Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00589"},{"key":"e_1_3_2_1_44_1","volume-title":"Music2Dance: DanceNet for Music-driven Dance Generation. arXiv: Computer Vision and Pattern Recognition","author":"Zhuang Wenlin","year":"2020","unstructured":"Wenlin Zhuang, CongyiWang, Si-Yu Xia, Jinxiang Chai, and YangangWang. 2020. Music2Dance: DanceNet for Music-driven Dance Generation. arXiv: Computer Vision and Pattern Recognition (2020)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612307","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612307","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:56:57Z","timestamp":1755820617000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612307"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":44,"alternative-id":["10.1145\/3581783.3612307","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612307","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}