{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,2]],"date-time":"2025-06-02T18:40:09Z","timestamp":1748889609617,"version":"3.41.0"},"reference-count":66,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/access.2025.3572954","type":"journal-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T17:05:40Z","timestamp":1748019940000},"page":"92641-92662","source":"Crossref","is-referenced-by-count":0,"title":["VT2Music: A Multimodal Framework for Text-Visual Guided Music Generation and Comprehensive Performance Analysis"],"prefix":"10.1109","volume":"13","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1407-8894","authenticated-orcid":false,"given":"Jiaxiang","family":"Zheng","sequence":"first","affiliation":[{"name":"Department of Global Cultural Convergence, Graduate School, Kangwon National University, Chuncheon, Gangwon-do, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2769-2316","authenticated-orcid":false,"given":"Moxi","family":"Cao","sequence":"additional","affiliation":[{"name":"Department of Global Cultural Convergence, Graduate School, Kangwon National University, Chuncheon, Gangwon-do, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5214-9840","authenticated-orcid":false,"given":"Chongbin","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Modern Music and Technology, Nanjing University of the Arts, Nanjing, Jiangsu, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1376","article-title":"Museformer: Transformer with fine-and coarse-grained attention for music generation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Yu"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413671"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.459"},{"key":"ref4","first-page":"47704","article-title":"Simple and controllable music generation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Copet"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/cai59869.2024.00146"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01433"},{"key":"ref7","first-page":"12360","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. 33rd Int. Conf. Neural Inf. Process. Syst.","author":"Ho"},{"key":"ref8","first-page":"41","article-title":"Cascaded diffusion models for high fidelity image generation","volume":"23","author":"Ho","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"ref9","first-page":"8599","article-title":"Grad-TTS: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Va"},{"key":"ref10","article-title":"Imagen video: High definition video generation with diffusion models","author":"Ho","year":"2022","journal-title":"arXiv:2210.02303"},{"key":"ref11","article-title":"Make-an-audio: Text-to-audio generation with promptenhanced diffusion models","author":"Huang","year":"2023","journal-title":"arXiv:2301.12661"},{"key":"ref12","article-title":"Msanii: High fidelity music synthesis on a shoestring budget","author":"Maina","year":"2023","journal-title":"arXiv:2301.06468"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2765202"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11312"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10379"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref19","article-title":"MusicLM: Generating music from text","author":"Agostinelli","year":"2023","journal-title":"arXiv:2301.11325"},{"key":"ref20","article-title":"MuLan: A joint embedding of music audio and natural language","author":"Huang","year":"2022","journal-title":"arXiv:2208.12415"},{"key":"ref21","article-title":"Scaling instruction-finetuned language models","author":"Chung","year":"2022","journal-title":"arXiv:2210.11416"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref23","article-title":"AudioLDM: Text-to-audio generation with latent diffusion models","author":"Liu","year":"2023","journal-title":"arXiv:2301.12503"},{"key":"ref24","article-title":"Noise2Music: Text-conditioned music generation with diffusion models","author":"Huang","year":"2023","journal-title":"arXiv:2302.03917"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447265"},{"key":"ref26","article-title":"Multitrack MusicLDM: Towards versatile music generation with latent diffusion model","author":"Karchkhadze","year":"2024","journal-title":"arXiv:2409.02845"},{"key":"ref27","article-title":"Mo\u00fbsai: Text-to-music generation with long-context latent diffusion","author":"Schneider","year":"2023","journal-title":"arXiv:2301.11757"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref30","first-page":"12652","article-title":"Fast timingconditioned latent audio diffusion","volume-title":"Proc. 41st Int. Conf. Mach. 
Learn.","volume":"235","author":"Evans"},{"key":"ref31","article-title":"LAION-400M: Open dataset of CLIP-filtered 400 million image-text pairs","author":"Schuhmann","year":"2021","journal-title":"arXiv:2111.02114"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463257"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28299"},{"key":"ref35","article-title":"M2UGen: Multi-modal music understanding and generation with the power of large language models","author":"Liu","year":"2023","journal-title":"arXiv:2311.11255"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-024-00370-6"},{"key":"ref37","article-title":"Content-based video-music retrieval using soft intra-modal structure constraint","author":"Hong","year":"2017","journal-title":"arXiv:1704.06761"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_11"},{"key":"ref40","article-title":"Jukebox: A generative model for music","author":"Dhariwal","year":"2020","journal-title":"arXiv:2005.00341"},{"key":"ref41","first-page":"1","article-title":"Music transformer: Generating music with long-term structure","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Huang"},{"key":"ref42","article-title":"Multi-instrumentalist net: Unsupervised generation of music from body movements","author":"Su","year":"2020","journal-title":"arXiv:2012.03478"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_44"},{"key":"ref44","first-page":"29258","article-title":"How does it sound? Generation of rhythmic soundtracks for human movement videos","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"34","author":"Su"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475195"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref48","article-title":"Fr\u00e9chet audio distance: A metric for evaluating music enhancement algorithms","author":"Kilgour","year":"2018","journal-title":"arXiv:1812.08466"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2016"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2018.02489"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1037\/pmu0000032"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1177\/0305735613482023"},{"issue":"3","key":"ref55","first-page":"159","article-title":"Emotions represented and induced by music: The role of individual differences","volume":"35","author":"Vuoskoski","year":"2012","journal-title":"Behav. Brain Sci."},{"key":"ref56","first-page":"3","article-title":"On cultural, textual and experiential aspects of music mood","volume-title":"Proc. Int. Soc. Music Inf. Retr. 
Conf.","author":"Singhi"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1111\/j.1540-594X.2007.00272.x"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1177\/0305735691192004"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.2190\/A704-5647-5245-R47P"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1177\/1029864911403367"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/2808196.2811643"},{"key":"ref62","article-title":"An empirical approach to the relationship between emotion and music production quality","author":"Ronan","year":"2018","journal-title":"arXiv:1803.11154"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.2478\/amns.2023.1.00386"},{"key":"ref64","first-page":"362","article-title":"On the relationships between music-induced emotion and physiological signals","volume-title":"Proc. Int. Soc. Music Inf. Retr. Conf.","author":"Hu"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/T-AFFC.2011.15"},{"issue":"1","key":"ref66","first-page":"1","article-title":"Emotion-aware music recommendation using physiological signals","volume":"5","author":"Yang","year":"2015","journal-title":"ACM Trans. Interact. Intell. Syst. (TiiS)"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/10820123\/11014098.pdf?arnumber=11014098","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,2]],"date-time":"2025-06-02T18:03:50Z","timestamp":1748887430000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11014098\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":66,"URL":"https:\/\/doi.org\/10.1109\/access.2025.3572954","relation":{},"ISSN":["2169-3536"],"issn-type":[{"type":"electronic","value":"2169-3536"}],"subject":[],"published":{"date-parts":[[2025]]}}}