{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T19:52:57Z","timestamp":1776109977295,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272396,52275234"],"award-info":[{"award-number":["62272396,52275234"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Humanities and Social Sciences Foundation of the Ministry of Education of China","award":["22YJA760055"],"award-info":[{"award-number":["22YJA760055"]}]},{"name":"Science and Technology Innovation Program Project of Beijing Institute of Technology","award":["2024CX01023"],"award-info":[{"award-number":["2024CX01023"]}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["2024CX06123"],"award-info":[{"award-number":["2024CX06123"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706599.3719686","type":"proceedings-article","created":{"date-parts":[[2025,4,28]],"date-time":"2025-04-28T12:15:26Z","timestamp":1745842526000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Mozualization: Crafting Music and Visual Representation with Multimodal AI"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3014-2901","authenticated-orcid":false,"given":"Wanfang","family":"Xu","sequence":"first","affiliation":[{"name":"School of Advanced Technology, Xi'an Jiaotong-Liverpool University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6181-1673","authenticated-orcid":false,"given":"Lixiang","family":"Zhao","sequence":"additional","affiliation":[{"name":"School of Advanced Technology, Xi'an Jiaotong-Liverpool University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4386-7024","authenticated-orcid":false,"given":"Haiwen","family":"Song","sequence":"additional","affiliation":[{"name":"Beijing Institude of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7930-4079","authenticated-orcid":false,"given":"Xinheng","family":"Song","sequence":"additional","affiliation":[{"name":"School of Design and Arts, Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7775-5352","authenticated-orcid":false,"given":"Zhaolin","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Design And Arts, Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0226-1311","authenticated-orcid":false,"given":"Yu","family":"Liu","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3122-6788","authenticated-orcid":false,"given":"Min","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Advanced Technology, Xi'an Jiaotong-Liverpool University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0199-7386","authenticated-orcid":false,"given":"Eng Gee","family":"Lim","sequence":"additional","affiliation":[{"name":"School of Advanced Technology, Xian jiaotong-Liverpool University, Suzhou, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3152-2587","authenticated-orcid":false,"given":"Lingyun","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Advanced Technology, Xi'an Jiaotong-Liverpool University, Suzhou, Jiangsu, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","unstructured":"Andrea Agostinelli Timo\u00a0I. Denk Zal\u00e1n Borsos Jesse Engel Mauro Verzetti Antoine Caillon Qingqing Huang Aren Jansen Adam Roberts Marco Tagliasacchi Matt Sharifi Neil Zeghidour and Christian Frank. 2023. MusicLM: Generating Music From Text. 10.48550\/arXiv.2301.11325 arxiv:https:\/\/arXiv.org\/abs\/2301.11325","DOI":"10.48550\/arXiv.2301.11325"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","unstructured":"Ye Bai Haonan Chen Jitong Chen et\u00a0al. 2024. Seed-music: A unified framework for high quality and controlled music generation. 10.48550\/arXiv.2409.09214 arxiv:https:\/\/arXiv.org\/abs\/2409.09214","DOI":"10.48550\/arXiv.2409.09214"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","unstructured":"Mattia Bellini. 2022. Interactive digital narratives as complex expressive means. Frontiers in Virtual Reality 3 (2022) 854960. 10.3389\/frvir.2022.854960","DOI":"10.3389\/frvir.2022.854960"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","unstructured":"Jean-Pierre Briot Ga\u00ebtan Hadjeres and Fran\u00e7ois-David Pachet. 2019. Deep Learning Techniques for Music Generation \u2013 A Survey. 10.48550\/arXiv.1709.01620 arxiv:https:\/\/arXiv.org\/abs\/1709.01620","DOI":"10.48550\/arXiv.1709.01620"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02533"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2306.05284"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","unstructured":"S. Davis and P. Mermelstein. 1980. Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. IEEE Transactions on Acoustics Speech and Signal Processing 28 4 (1980) 357\u2013366. 10.1109\/TASSP.1980.1163420","DOI":"10.1109\/TASSP.1980.1163420"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11312"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","unstructured":"Matthew Duignan. 2008. Computer Mediated Music Production: a Study of Abstraction and Activity. Ph.\u00a0D. Dissertation. Open Access Te Herenga Waka-Victoria University of Wellington. 10.26686\/wgtn.16945879.v1","DOI":"10.26686\/wgtn.16945879.v1"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","unstructured":"ROSINA FILIMON. 2023. Syncretism and synaesthesia in music \u2013 unification of arts and perceptions. ARTES. JOURNAL OF MUSICOLOGY 27 (07 2023) 167\u2013184. 10.35218\/ajm-2023-0010","DOI":"10.35218\/ajm-2023-0010"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","unstructured":"J Julia P\u00a0D Iswara and T Supriyadi. 2018. Song creation by using computer music notation. IOP Conference Series: Materials Science and Engineering 434 1 (nov 2018) 012055. 10.1088\/1757-899X\/434\/1\/012055","DOI":"10.1088\/1757-899X\/434\/1\/012055"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","unstructured":"Jaeyong Kang Soujanya Poria and Dorien Herremans. 2024. Video2Music: Suitable music generation from videos using an Affective Multimodal Transformer model. Expert Syst. Appl. 249 PC (Sept. 2024) 17\u00a0pages. 10.1016\/j.eswa.2024.123640","DOI":"10.1016\/j.eswa.2024.123640"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2305.15719"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","unstructured":"Junnan Li Dongxu Li Caiming Xiong and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. 10.48550\/arXiv.2201.12086 arxiv:https:\/\/arXiv.org\/abs\/2201.12086","DOI":"10.48550\/arXiv.2201.12086"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","unstructured":"Julius\u00a0O. Smith. 2004. Virtual Acoustic Musical Instruments: Review and Update. Journal of New Music Research 33 3 (2004) 283\u2013304. 10.1080\/0929821042000317859 arXiv:10.1080\/0929821042000317859","DOI":"10.1080\/0929821042000317859"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","unstructured":"Cihan Tabak. 2023. INTELLIGENT MUSIC APPLICATIONS: INNOVATIVE SOLUTIONS FOR MUSICIANS AND LISTENERS. Uluslararas\u0131 Anadolu Sosyal Bilimler Dergisi 7 3 (2023) 752\u2013773. 10.47525\/ulasbid.1324070","DOI":"10.47525\/ulasbid.1324070"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.2307\/3679551"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","unstructured":"V\u00edctor Villapalos. 2024. Inteligencia artificial al alcance de todos: plataformas y proyectos destacados. CLIP de SEDIC: Revista de la Sociedad Espa\u00f1ola de Documentaci\u00f3n e Informaci\u00f3n Cient\u00edfica 89 2 (06 2024) 14\u201326. 10.47251\/clip.n89.143","DOI":"10.47251\/clip.n89.143"},{"key":"e_1_3_3_2_21_2","volume-title":"VLDB Workshops","author":"Wang Mei","year":"2024","unstructured":"Mei Wang, Hai-Ning Liang, Yu Liu, Chengtao Ji, and Lingyun Yu. 2024. Tangible Progress: Employing Visual Metaphors and Physical Interfaces in AI-based English Language Learning. In VLDB Workshops. VLDB Endowment, New York, USA, 7\u00a0pages. https:\/\/api.semanticscholar.org\/CorpusID:273878571"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","unstructured":"Jamie Ward Daisy Thompson-Lake Roxanne Ely and Flora Kaminski. 2008. Synaesthesia creativity and art: What is the link? British journal of psychology (London England : 1953) 99 (03 2008) 127\u201341. 10.1348\/000712607X204164","DOI":"10.1348\/000712607X204164"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","unstructured":"Li-Chia Yang Szu-Yu Chou and Yi-Hsuan Yang. 2017. MidiNet: A Convolutional Generative Adversarial Network for Symbolic-Domain Music Generation. CoRR abs\/1703.10847 (2017) 8\u00a0pages. 10.48550\/arXiv.1703.10847","DOI":"10.48550\/arXiv.1703.10847"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","unstructured":"Tiancheng Yang and Shah Nazir. 2021. A comprehensive overview of AI-enabled music classification and its influence in games. Soft Computing 26 (2021) 7679 \u2013 7693. 10.1007\/s00500-022-06734-4","DOI":"10.1007\/s00500-022-06734-4"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","unstructured":"Yixiao Zhang Yukara Ikemiya Gus Xia Naoki Murata Marco\u00a0A. Mart\u00ednez-Ram\u00edrez Wei-Hsiang Liao Yuki Mitsufuji and Simon Dixon. 2024. MusicMagus: Zero-Shot Text-to-Music Editing via Diffusion Models. 10.48550\/arXiv.2402.06178 arxiv:https:\/\/arXiv.org\/abs\/2402.06178","DOI":"10.48550\/arXiv.2402.06178"}],"event":{"name":"CHI EA '25: Extended Abstracts of the CHI Conference on Human Factors in Computing Systems","location":"Yokohama Japan","acronym":"CHI EA '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the Extended Abstracts of the CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706599.3719686","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706599.3719686","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:43Z","timestamp":1750295923000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706599.3719686"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":24,"alternative-id":["10.1145\/3706599.3719686","10.1145\/3706599"],"URL":"https:\/\/doi.org\/10.1145\/3706599.3719686","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}