{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T04:01:45Z","timestamp":1748491305101,"version":"3.41.0"},"reference-count":26,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icasspw65056.2025.11011064","type":"proceedings-article","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T17:05:14Z","timestamp":1748365514000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["Musimple: A Simplified Music Generation System with Diffusion Transformer"],"prefix":"10.1109","author":[{"given":"Zheqi","family":"Dai","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong,Electronic Engineering"}]},{"given":"Haolin","family":"He","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Electronic Engineering"}]},{"given":"Qiuqiang","family":"Kong","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Electronic Engineering"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-70163-9"},{"article-title":"Music transformer","volume-title":"International Conference on Learning Representations","author":"Huang","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/taslpro.2025.3530270"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34761"},{"key":"ref5","article-title":"Simple and 
controllable music generation","volume":"36","author":"Copet","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Musiclm: Generating music from text","year":"2023","author":"Agostinelli","key":"ref6"},{"article-title":"Auto-encoding variational bayes","year":"2013","author":"Kingma","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"article-title":"Mousai: Text-to-music generation with long-context latent diffusion","year":"2023","author":"Schneider","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"issue":"140","key":"ref12","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"Journal of machine learning research"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49660.2025.10888461"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"article-title":"Bigvgan: A universal neural vocoder with large-scale training","year":"2022","author":"Lee","key":"ref16"},{"key":"ref17","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref18","first-page":"8780","article-title":"Diffusion models beat gans on image synthesis","volume":"34","author":"Dhariwal","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref19","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"The gtzan dataset: Its contents, its 
faults, their effects on evaluation, and its future use","year":"2013","author":"Sturm","key":"ref20"},{"article-title":"Adam: A method for stochastic optimization","year":"2014","author":"Kingma","key":"ref21"},{"key":"ref22","article-title":"Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results","volume":"30","author":"Tarvainen","year":"2017","journal-title":"Advances in neural information processing systems"},{"article-title":"Fr\u00e9chet audio distance: A metric for evaluating music enhancement algorithms","year":"2018","author":"Kilgour","key":"ref23"},{"article-title":"Notes on kullback-leibler divergence and likelihood","year":"2014","author":"Shlens","key":"ref24"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"article-title":"Riffusion - Stable diffusion for real-time music generation","year":"2022","author":"Forsgren","key":"ref26"}],"event":{"name":"2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","start":{"date-parts":[[2025,4,6]]},"location":"Hyderabad, India","end":{"date-parts":[[2025,4,11]]}},"container-title":["2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11010992\/11010997\/11011064.pdf?arnumber=11011064","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,28]],"date-time":"2025-05-28T04:51:26Z","timestamp":1748407886000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11011064\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":26,"URL":"https:\/\/doi.org\/10.1109\/icasspw65056.2025.11011064","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}