{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T10:34:45Z","timestamp":1777631685435,"version":"3.51.4"},"reference-count":66,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],
"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/tmm.2025.3590912","type":"journal-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T18:44:14Z","timestamp":1753296254000},"page":"6857-6871","source":"Crossref","is-referenced-by-count":12,"title":["XMusic: Towards a Generalized and Controllable Symbolic Music Generation Framework"],"prefix":"10.1109","volume":"27","author":[{"given":"Sida","family":"Tian","sequence":"first","affiliation":[{"name":"Tencent, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9530-5218","authenticated-orcid":false,"given":"Can","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}]},{"given":"Wei","family":"Yuan","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}]},{"given":"Wei","family":"Tan","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}]},{"given":"Wenjie","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref2","article-title":"MusicLM: Generating music from text","author":"Agostinelli","year":"2023"},{"key":"ref3","article-title":"Riffusion - stable diffusion for real-time music generation","author":"Forsgren","year":"2022"},{"key":"ref4","first-page":"47704","article-title":"Simple and controllable music generation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Copet","year":"2023"},{"key":"ref5","article-title":"Noise2Music: Text-conditioned music generation with diffusion models","author":"Huang","year":"2023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref8","article-title":"Music transformer: Generating music with long-term structure","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Huang","year":"2018"},
{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413671"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i1.16091"},{"key":"ref11","first-page":"318","article-title":"EMOPIA: A multi-modal pop piano dataset for emotion recognition and emotion-based music generation","volume-title":"Proc. Int. Soc. Music Inf. Retrieval Conf.","author":"Hung","year":"2021"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3163543"},{"key":"ref13","article-title":"MuseNet","volume":"3","author":"Payne","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-29956-8_17"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3276177"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747802"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_44"},{"key":"ref18","first-page":"3325","article-title":"Audeo: Audio generation for a silent performance video","volume":"33","author":"Su","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475195"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01433"},{"key":"ref21","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ouyang","year":"2022"},
{"key":"ref22","article-title":"GPT-4 technical report","year":"2023"},{"key":"ref23","article-title":"LaMDA: Language models for dialog applications","author":"Thoppilan","year":"2022"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"ref25","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"Ramesh","year":"2022"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.437"},{"key":"ref28","first-page":"324","article-title":"MidiNet: A convolutional generative adversarial network for symbolic-domain music generation","volume-title":"Proc. Int. Soc. Music Inf. Retrieval Conf.","author":"Yang","year":"2017"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11312"},{"key":"ref30","first-page":"1899","article-title":"Encoding musical style with transformer autoencoders","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Choi","year":"2020"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054554"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICTAI.2018.00123"},{"key":"ref33","first-page":"190","article-title":"Convolutional generative adversarial networks with binary neurons for polyphonic music generation","volume-title":"Proc. Int. Soc. Music Inf. Retrieval Conf.","author":"Dong","year":"2018"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3284996"},{"key":"ref35","first-page":"143","article-title":"Controllable deep melody generation via hierarchical music structure representation","volume-title":"Proc. Int. Soc. Music Inf. Retrieval Conf.","author":"Dai","year":"2021"},{"key":"ref36","first-page":"725","article-title":"StructureNet: Inducing structure in generated melodies","volume-title":"Proc. Int. Soc. Music Inf. Retrieval Conf.","author":"Medeot","year":"2018"},
{"key":"ref37","article-title":"Modeling self-repetition in music generation using generative adversarial networks","volume-title":"Proc. Mach. Learn. Music Discov. Workshop","author":"Jhamtani","year":"2019"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2017.2737984"},{"key":"ref39","article-title":"Generating music with sentiment using transformer-GANs","author":"Neves","year":"2022"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3321495"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3161851"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3146002"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3424116"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3443664"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3572031"},{"key":"ref46","first-page":"29258","article-title":"How does it sound?","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Su","year":"2021"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_11"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref49","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref51","article-title":"PySceneDetect documentation","year":"2024"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201371"},{"key":"ref53","first-page":"293","article-title":"Vocano: A note transcription framework for singing voice in polyphonic music","volume-title":"Proc. Int. Soc. Music Inf. Retrieval Conf.","author":"Hsu","year":"2021"},
{"key":"ref54","first-page":"570","article-title":"Multi-modal music emotion recognition: A new dataset, methodology and comparative analysis","volume-title":"Proc. 10th Int. Symp. Comput. Music Multidisciplinary Res.","author":"Panda","year":"2013"},{"key":"ref55","first-page":"384","article-title":"Learning to generate music with sentiment","volume-title":"Proc. 20th Int. Soc. Music Inf. Retrieval Conf.","author":"Ferreira","year":"2019"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_36"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"ref58","first-page":"142","article-title":"The Jazz transformer on the front line: Exploring the shortcomings of AI-composed music through quantitative measures","volume-title":"Proc. Int. Soc. Music Inf. Retrieval Conf.","author":"Wu","year":"2020"},{"key":"ref59","article-title":"Pypianoroll: Open source python package for handling multitrack pianoroll","volume-title":"Proc. Int. Soc. Music Inf. Retrieval Conf. Late-Breaking Paper","author":"Dong","year":"2018"},{"key":"ref60","article-title":"MusPy: A toolkit for symbolic music generation","volume-title":"Proc. 21st Int. Soc. Music Inf. Retrieval Conf.","author":"Dong","year":"2020"},{"key":"ref61","article-title":"Exploring the efficacy of pre-trained checkpoints in text-to-music generation task","author":"Wu","year":"2022"},{"key":"ref62","first-page":"247","article-title":"Automated music generation for visual art through emotion","volume-title":"Proc. Int. Conf. Commun. China","author":"Tan","year":"2020"},
{"key":"ref63","article-title":"Sparks of artificial general intelligence: Early experiments with GPT-4","author":"Bubeck","year":"2023"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/icme.2008.4607692"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.5220\/0006597705010506"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/840"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6046\/10844992\/11091494.pdf?arnumber=11091494","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T13:33:47Z","timestamp":1759239227000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11091494\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":66,"URL":"https:\/\/doi.org\/10.1109\/tmm.2025.3590912","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}