{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:20:05Z","timestamp":1778048405075,"version":"3.51.4"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T00:00:00Z","timestamp":1778025600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T00:00:00Z","timestamp":1778025600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100007530","name":"National Taiwan University of Science and Technology","doi-asserted-by":"crossref","id":[{"id":"10.13039\/501100007530","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1007\/s00530-026-02311-z","type":"journal-article","created":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:02:11Z","timestamp":1778047331000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["DanceLDM: latent-based diffusion model for dance generation and editing conditioned on music and text prompt"],"prefix":"10.1007","volume":"32","author":[{"given":"Ming-Cong","family":"Su","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei-Lun","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tse-Yu","family":"Pan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,5,6]]},"reference":[{"key":"2311_CR1","doi-asserted-by":"crossref","unstructured":"Zhu, W., Ma, X., Ro, D., Ci, H., Zhang, J., Shi, J., Gao, F., Tian, Q., Wang, Y.: Human motion generation: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)","DOI":"10.1109\/TPAMI.2023.3330935"},{"key":"2311_CR2","doi-asserted-by":"crossref","unstructured":"Tseng, J., Castellon, R., Liu, K.: Edge: Editable dance generation from music. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 448\u2013458 (2023)","DOI":"10.1109\/CVPR52729.2023.00051"},{"key":"2311_CR3","doi-asserted-by":"crossref","unstructured":"Gong, K., Lian, D., Chang, H., Guo, C., Jiang, Z., Zuo, X., Mi, M.B., Wang, X.: Tm2d: Bimodality driven 3d dance generation via music-text integration. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9942\u20139952 (2023)","DOI":"10.1109\/ICCV51070.2023.00912"},{"key":"2311_CR4","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2311_CR5","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"issue":"11","key":"2311_CR6","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., Bengio, Y.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"2311_CR7","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"key":"2311_CR8","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2311_CR9","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Action-conditioned 3d human motion synthesis with transformer vae. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10985\u201310995 (2021)","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"2311_CR10","doi-asserted-by":"crossref","unstructured":"Guo, C., Zou, S., Zuo, X., Wang, S., Ji, W., Li, X., Cheng, L.: Generating diverse and natural 3d human motions from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5152\u20135161 (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"2311_CR11","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-or, D., Bermano, A.H.: Human motion diffusion model. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=SJ1kSyO2jwu"},{"key":"2311_CR12","doi-asserted-by":"crossref","unstructured":"Chen, X., Jiang, B., Liu, W., Huang, Z., Fu, B., Chen, T., Yu, G.: Executing your commands via motion diffusion in latent space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18000\u201318010 (2023)","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"2311_CR13","first-page":"20067","volume":"36","author":"B Jiang","year":"2024","unstructured":"Jiang, B., Chen, X., Liu, W., Yu, J., Yu, G., Chen, T.: Motiongpt: human motion as a foreign language. Adv. Neural. Inf. Process. Syst. 36, 20067\u201320079 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2311_CR14","doi-asserted-by":"crossref","unstructured":"Li, R., Yang, S., Ross, D.A., Kanazawa, A.: Ai choreographer: Music conditioned 3d dance generation with aist++. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13401\u201313412 (2021)","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"2311_CR15","doi-asserted-by":"crossref","unstructured":"Siyao, L., Yu, W., Gu, T., Lin, C., Wang, Q., Qian, C., Loy, C.C., Liu, Z.: Bailando: 3d dance generation by actor-critic gpt with choreographic memory. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11050\u201311059 (2022)","DOI":"10.1109\/CVPR52688.2022.01077"},{"key":"2311_CR16","first-page":"14959","volume":"35","author":"Z Wang","year":"2022","unstructured":"Wang, Z., Chen, Y., Liu, T., Zhu, Y., Liang, W., Huang, S.: Humanise: language-conditioned human motion generation in 3d scenes. Adv. Neural. Inf. Process. Syst. 35, 14959\u201314971 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2311_CR17","doi-asserted-by":"crossref","unstructured":"Huang, S., Wang, Z., Li, P., Jia, B., Liu, T., Zhu, Y., Liang, W., Zhu, S.-C.: Diffusion-based generation, optimization, and planning in 3d scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16750\u201316761 (2023)","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"2311_CR18","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Cheema, N., Oguz, C., Theobalt, C., Slusallek, P.: Synthesis of compositional animations from textual descriptions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1396\u20131406 (2021)","DOI":"10.1109\/ICCV48922.2021.00143"},{"key":"2311_CR19","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Temos: Generating diverse human motions from textual descriptions. In: European Conference on Computer Vision, pp. 480\u2013497 (2022). Springer","DOI":"10.1007\/978-3-031-20047-2_28"},{"issue":"8","key":"2311_CR20","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"2311_CR21","doi-asserted-by":"crossref","unstructured":"Zhang, J., Zhang, Y., Cun, X., Zhang, Y., Zhao, H., Lu, H., Shen, X., Shan, Y.: Generating human motion from textual descriptions with discrete representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14730\u201314740 (2023)","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"2311_CR22","unstructured":"Van Den Oord, A., Vinyals, O., et al.: Neural discrete representation learning. Advances in neural information processing systems 30 (2017)"},{"issue":"140","key":"2311_CR23","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., Liu, P.J.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020)","journal-title":"J. Mach. Learn. Res."},{"issue":"70","key":"2311_CR24","first-page":"1","volume":"25","author":"HW Chung","year":"2024","unstructured":"Chung, H.W., Hou, L., Longpre, S., Zoph, B., Tay, Y., Fedus, W., Li, Y., Wang, X., Dehghani, M., Brahma, S.: Scaling instruction-finetuned language models. J. Mach. Learn. Res. 25(70), 1\u201353 (2024)","journal-title":"J. Mach. Learn. Res."},{"key":"2311_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, M., Cai, Z., Pan, L., Hong, F., Guo, X., Yang, L., Liu, Z.: Motiondiffuse: Text-driven human motion generation with diffusion model. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)","DOI":"10.1109\/TPAMI.2024.3355414"},{"key":"2311_CR26","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., : Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021). PMLR"},{"issue":"4","key":"2311_CR27","first-page":"1","volume":"40","author":"K Chen","year":"2021","unstructured":"Chen, K., Tan, Z., Lei, J., Zhang, S.-H., Guo, Y.-C., Zhang, W., Hu, S.-M.: Choreomaster: choreography-oriented music-driven dance synthesis. ACM Trans. Graph. 40(4), 1\u201313 (2021)","journal-title":"ACM Trans. Graph."},{"key":"2311_CR28","doi-asserted-by":"crossref","unstructured":"Kim, J., Oh, H., Kim, S., Tong, H., Lee, S.: A brand new dance partner: music-conditioned pluralistic dancing controlled by multiple dance genres. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3490\u20133500 (2022)","DOI":"10.1109\/CVPR52688.2022.00348"},{"key":"2311_CR29","doi-asserted-by":"crossref","unstructured":"Kim, J., Kwon, B., Kim, J., Lee, S.: Mnet++: music-driven pluralistic dancing toward multiple dance genre synthesis. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)","DOI":"10.1109\/TPAMI.2023.3312092"},{"key":"2311_CR30","unstructured":"Dhariwal, P., Jun, H., Payne, C., Kim, J.W., Radford, A., Sutskever, I.: Jukebox: a generative model for music. arXiv preprint arXiv:2005.00341 (2020)"},{"key":"2311_CR31","doi-asserted-by":"crossref","unstructured":"Qi, Q., Zhuo, L., Zhang, A., Liao, Y., Fang, F., Liu, S., Yan, S.: Diffdance: cascaded human motion diffusion model for dance generation. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 1374\u20131382 (2023)","DOI":"10.1145\/3581783.3612307"},{"key":"2311_CR32","doi-asserted-by":"crossref","unstructured":"Li, R., Zhang, Y., Zhang, Y., Zhang, H., Guo, J., Zhang, Y., Liu, Y., Li, X.: Lodge: A coarse to fine diffusion network for long dance generation guided by the characteristic dance primitives. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1524\u20131534 (2024)","DOI":"10.1109\/CVPR52733.2024.00151"},{"key":"2311_CR33","doi-asserted-by":"crossref","unstructured":"Wu, H.-H., Seetharaman, P., Kumar, K., Bello, J.P.: Wav2clip: Learning robust audio representations from clip. In: ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4563\u20134567 (2022). IEEE","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"2311_CR34","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning, pp. 8162\u20138171 (2021). PMLR"},{"key":"2311_CR35","doi-asserted-by":"crossref","unstructured":"Blattmann, A., Rombach, R., Ling, H., Dockhorn, T., Kim, S.W., Fidler, S., Kreis, K.: Align your latents: high-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22563\u201322575 (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"2311_CR36","unstructured":"Liu, Y., Zhang, K., Li, Y., Yan, Z., Gao, C., Chen, R., Yuan, Z., Huang, Y., Sun, H., Gao, J., et al.: Sora: a review on background, technology, limitations, and opportunities of large vision models. arXiv preprint arXiv:2402.17177 (2024)"},{"key":"2311_CR37","doi-asserted-by":"crossref","unstructured":"Chen, K., Wu, Y., Liu, H., Nezhurina, M., Berg-Kirkpatrick, T., Dubnov, S.: Musicldm: Enhancing novelty in text-to-music generation using beat-synchronous mixup strategies. In: ICASSP 2024\u20132024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1206\u20131210 (2024). IEEE","DOI":"10.1109\/ICASSP48485.2024.10447265"},{"key":"2311_CR38","doi-asserted-by":"crossref","unstructured":"Liu, H., Yuan, Y., Liu, X., Mei, X., Kong, Q., Tian, Q., Wang, Y., Wang, W., Wang, Y., Plumbley, M.D.: Audioldm 2: Learning holistic audio generation with self-supervised pretraining. Speech, and Language Processing, IEEE\/ACM Transactions on Audio (2024)","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"2311_CR39","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E.L., Ghasemipour, K., Gontijo Lopes, R., Karagol Ayan, B., Salimans, T.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2311_CR40","doi-asserted-by":"crossref","unstructured":"McFee, B., Raffel, C., Liang, D., Ellis, D.P., McVicar, M., Battenberg, E., Nieto, O.: librosa: Audio and music signal analysis in python. In: SciPy, pp. 18\u201324 (2015)","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"2311_CR41","doi-asserted-by":"crossref","unstructured":"Perez, E., Strub, F., De Vries, H., Dumoulin, V., Courville, A.: Film: Visual reasoning with a general conditioning layer. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"2311_CR42","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"2311_CR43","doi-asserted-by":"crossref","unstructured":"Guo, C., Zuo, X., Wang, S., Cheng, L.: Tm2t: Stochastic and tokenized modeling for the reciprocal generation of 3d human motions and texts. In: European Conference on Computer Vision, pp. 580\u2013597 (2022). Springer","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"2311_CR44","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: Amass: archive of motion capture as surface shapes. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5442\u20135451 (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"2311_CR45","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in Neural Information Processing Systems 30 (2017)"},{"key":"2311_CR46","unstructured":"Gopinath, D., Won, J.: Fairmotion-tools to load, process and visualize motion capture data. Github (2020)"},{"key":"2311_CR47","doi-asserted-by":"crossref","unstructured":"Xie, X., Zhou, P., Li, H., Lin, Z., Yan, S.: Adan: Adaptive nesterov momentum algorithm for faster optimizing deep models. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)","DOI":"10.1109\/TPAMI.2024.3423382"},{"key":"2311_CR48","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1007\/s00591-010-0080-8","volume":"58","author":"F Klinker","year":"2011","unstructured":"Klinker, F.: Exponential moving average versus moving exponential average. Math. Semesterber. 58, 97\u2013107 (2011)","journal-title":"Math. Semesterber."},{"key":"2311_CR49","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: Global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"2311_CR50","unstructured":"Ling, Z., Han, B., Wong, Y., Lin, H., Kankanhalli, M., Geng, W.: Mcm: Multi-condition motion synthesis framework. In: Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence, IJCAI-24, pp. 1083\u20131091 (2024)"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-026-02311-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-026-02311-z","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-026-02311-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:02:54Z","timestamp":1778047374000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-026-02311-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,6]]},"references-count":50,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2026,8]]}},"alternative-id":["2311"],"URL":"https:\/\/doi.org\/10.1007\/s00530-026-02311-z","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5,6]]},"assertion":[{"value":"7 September 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 February 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 May 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"274"}}