{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:51Z","timestamp":1765342791162,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","funder":[{"name":"the National Key Research and Development Program of China","award":["2023YFF0904900"],"award-info":[{"award-number":["2023YFF0904900"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755656","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"10427-10436","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Controllable Video-to-Music Generation with Multiple Time-Varying Conditions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-8447-2121","authenticated-orcid":false,"given":"Junxian","family":"Wu","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9625-5547","authenticated-orcid":false,"given":"Weitao","family":"You","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7999-2317","authenticated-orcid":false,"given":"Heda","family":"Zuo","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6307-7692","authenticated-orcid":false,"given":"Dengming","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0962-6459","authenticated-orcid":false,"given":"Pei","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5561-0493","authenticated-orcid":false,"given":"Lingyun","family":"Sun","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325","author":"Agostinelli Andrea","year":"2023","unstructured":"Andrea Agostinelli, Timo I Denk, Zal\u00e1n Borsos, Jesse Engel, Mauro Verzetti, Antoine Caillon, Qingqing Huang, Aren Jansen, Adam Roberts, Marco Tagliasacchi, et al., 2023. Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2973795"},{"key":"e_1_3_2_1_3_1","first-page":"255","article-title":"Joint Beat and Downbeat Tracking with Recurrent Neural Networks","author":"B\u00f6ck Sebastian","year":"2016","unstructured":"Sebastian B\u00f6ck, Florian Krebs, and Gerhard Widmer. 2016b. Joint Beat and Downbeat Tracking with Recurrent Neural Networks.. In ISMIR. New York City, 255-261.","journal-title":"ISMIR. New York City"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095653"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02533"},{"key":"e_1_3_2_1_6_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Copet Jade","year":"2024","unstructured":"Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi, and Alexandre D\u00e9fossez. 2024. Simple and controllable music generation. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682475"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1177\/20592043221117651"},{"key":"e_1_3_2_1_9_1","unstructured":"Matthew EP Davies Norberto Degara and Mark D Plumbley. 2009. Evaluation methods for musical audio beat tracking algorithms. Queen Mary University of London Centre for Digital Music Tech. Rep. C4DM-TR-09-06 (2009)."},{"key":"e_1_3_2_1_10_1","volume-title":"High fidelity neural audio compression. arXiv preprint arXiv:2210.13438","author":"D\u00e9fossez Alexandre","year":"2022","unstructured":"Alexandre D\u00e9fossez, Jade Copet, Gabriel Synnaeve, and Yossi Adi. 2022. High fidelity neural audio compression. arXiv preprint arXiv:2210.13438 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475195"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/3692070.3692575"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_44"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446663"},{"key":"e_1_3_2_1_16_1","volume-title":"A Comprehensive Survey on Generative AI for Video-to-Music Generation. arXiv preprint arXiv:2502.12489","author":"Ji Shulei","year":"2025","unstructured":"Shulei Ji, Songruoyao Wu, Zihao Wang, Shuyu Li, and Kejun Zhang. 2025. A Comprehensive Survey on Generative AI for Video-to-Music Generation. arXiv preprint arXiv:2502.12489 (2025)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.123640"},{"key":"e_1_3_2_1_18_1","volume-title":"Fr'echet audio distance: A metric for evaluating music enhancement algorithms. arXiv preprint arXiv:1812.08466","author":"Kilgour Kevin","year":"2018","unstructured":"Kevin Kilgour, Mauricio Zuluaga, Dominik Roblek, and Matthew Sharifi. 2018. Fr'echet audio distance: A metric for evaluating music enhancement algorithms. arXiv preprint arXiv:1812.08466 (2018)."},{"key":"e_1_3_2_1_19_1","volume-title":"Musicongen: Rhythm and chord control for transformer-based text-to-music generation. arXiv preprint arXiv:2407.15060","author":"Lan Yun-Han","year":"2024","unstructured":"Yun-Han Lan, Wen-Yi Hsiao, Hao-Chung Cheng, and Yi-Hsuan Yang. 2024. Musicongen: Rhythm and chord control for transformer-based text-to-music generation. arXiv preprint arXiv:2407.15060 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"TVNet: A Novel Time Series Analysis Method Based on Dynamic Convolution and 3D-Variation. arXiv preprint arXiv:2503.07674","author":"Li Chenghan","year":"2025","unstructured":"Chenghan Li, Mingchen Li, and Ruisheng Diao. 2025. TVNet: A Novel Time Series Analysis Method Based on Dynamic Convolution and 3D-Variation. arXiv preprint arXiv:2503.07674 (2025)."},{"key":"e_1_3_2_1_21_1","volume-title":"MuVi: Video-to-Music Generation with Semantic Alignment and Rhythmic Synchronization. arXiv preprint arXiv:2410.12957","author":"Li Ruiqi","year":"2024","unstructured":"Ruiqi Li, Siqi Zheng, Xize Cheng, Ziang Zhang, Shengpeng Ji, and Zhou Zhao. 2024c. MuVi: Video-to-Music Generation with Semantic Alignment and Rhythmic Synchronization. arXiv preprint arXiv:2410.12957 (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02582"},{"key":"e_1_3_2_1_23_1","volume-title":"VidMusician: Video-to-Music Generation with Semantic-Rhythmic Alignment via Hierarchical Visual Features. arXiv preprint arXiv:2412.06296","author":"Li Sifei","year":"2024","unstructured":"Sifei Li, Binxin Yang, Chunji Yin, Chong Sun, Yuxin Zhang, Weiming Dong, and Chen Li. 2024b. VidMusician: Video-to-Music Generation with Semantic-Rhythmic Alignment via Hierarchical Visual Features. arXiv preprint arXiv:2412.06296 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Drawlody: Sketch-Based Melody Creation with Enhanced Usability and Interpretability","author":"Liang Qihao","year":"2024","unstructured":"Qihao Liang and Ye Wang. 2024. Drawlody: Sketch-Based Melody Creation with Enhanced Usability and Interpretability. IEEE Transactions on Multimedia (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"VMAS: Video-to-Music Generation via Semantic Alignment in Web Music Videos. arXiv preprint arXiv:2409.07450","author":"Lin Yan-Bo","year":"2024","unstructured":"Yan-Bo Lin, Yu Tian, Linjie Yang, Gedas Bertasius, and Heng Wang. 2024. VMAS: Video-to-Music Generation via Semantic Alignment in Web Music Videos. arXiv preprint arXiv:2409.07450 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Chenshuo Sun, and Ying Shan.","author":"Liu Shansong","year":"2023","unstructured":"Shansong Liu, Atin Sakkeer Hussain, Chenshuo Sun, and Ying Shan. 2023. M^2UGen: Multi-modal Music Understanding and Generation with the Power of Large Language Models. arXiv preprint arXiv:2311.11255 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.3389\/fpubh.2022.992200"},{"key":"e_1_3_2_1_28_1","volume-title":"Mustango: Toward controllable text-to-music generation. arXiv preprint arXiv:2311.08355","author":"Melechovsky Jan","year":"2023","unstructured":"Jan Melechovsky, Zixun Guo, Deepanway Ghosal, Navonil Majumder, Dorien Herremans, and Soujanya Poria. 2023. Mustango: Toward controllable text-to-music generation. arXiv preprint arXiv:2311.08355 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"International conference on machine learning. PMLR, 7176-7185","author":"Naeem Muhammad Ferjad","year":"2020","unstructured":"Muhammad Ferjad Naeem, Seong Joon Oh, Youngjung Uh, Yunjey Choi, and Jaejun Yoo. 2020. Reliable fidelity and diversity metrics for generative models. In International conference on machine learning. PMLR, 7176-7185."},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_31_1","first-page":"2014","article-title":"MIR_EVAL: A Transparent Implementation of Common MIR Metrics","volume":"10","author":"Raffel Colin","year":"2014","unstructured":"Colin Raffel, Brian McFee, Eric J Humphrey, Justin Salamon, Oriol Nieto, Dawen Liang, Daniel PW Ellis, and C Colin Raffel. 2014. MIR_EVAL: A Transparent Implementation of Common MIR Metrics.. In ISMIR, Vol. 10. 2014.","journal-title":"ISMIR"},{"key":"e_1_3_2_1_32_1","volume-title":"Audio Conditioning for Music Generation via Discrete Bottleneck Features. arXiv preprint arXiv:2407.12563","author":"Rouard Simon","year":"2024","unstructured":"Simon Rouard, Yossi Adi, Jade Copet, Axel Roebel, and Alexandre D\u00e9fossez. 2024. Audio Conditioning for Music Generation via Discrete Bottleneck Features. arXiv preprint arXiv:2407.12563 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096956"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1037\/h0077714"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28299"},{"key":"e_1_3_2_1_36_1","volume-title":"Multi-instrumentalist net: Unsupervised generation of music from body movements. arXiv preprint arXiv:2012.03478","author":"Su Kun","year":"2020","unstructured":"Kun Su, Xiulong Liu, and Eli Shlizerman. 2020. Multi-instrumentalist net: Unsupervised generation of music from body movements. arXiv preprint arXiv:2012.03478 (2020)."},{"key":"e_1_3_2_1_37_1","volume-title":"VidMuse: A Simple Video-to-Music Generation Framework with Long-Short-Term Modeling. arXiv preprint arXiv:2406.04321","author":"Tian Zeyue","year":"2024","unstructured":"Zeyue Tian, Zhaoyang Liu, Ruibin Yuan, Jiahao Pan, Xiaoqiang Huang, Qifeng Liu, Xu Tan, Qifeng Chen, Wei Xue, and Yike Guo. 2024. VidMuse: A Simple Video-to-Music Generation Framework with Long-Short-Term Modeling. arXiv preprint arXiv:2406.04321 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Pauli Virtanen Ralf Gommers Travis E Oliphant Matt Haberland Tyler Reddy David Cournapeau Evgeni Burovski Pearu Peterson Warren Weckesser Jonathan Bright et al. 2020. SciPy 1.0: fundamental algorithms for scientific computing in Python. Nature methods Vol. 17 3 (2020) 261-272.","DOI":"10.1038\/s41592-020-0772-5"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"e_1_3_2_1_41_1","volume-title":"Continuous Emotion-Based Image-to-Music Generation","author":"Wang Yajie","year":"2023","unstructured":"Yajie Wang, Mulin Chen, and Xuelong Li. 2023a. Continuous Emotion-Based Image-to-Music Generation. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399026"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3270726"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32155"},{"key":"e_1_3_2_1_46_1","volume-title":"SONIQUE: Video Background Music Generation Using Unpaired Audio-Visual Data. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5.","author":"Zhang Liqian","year":"2025","unstructured":"Liqian Zhang and Magdalena Fuentes. 2025. SONIQUE: Video Background Music Generation Using Unpaired Audio-Visual Data. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_1_48_1","first-page":"11127","article-title":"Uni-controlnet: All-in-one control to text-to-image diffusion models","volume":"36","author":"Zhao Shihao","year":"2023","unstructured":"Shihao Zhao, Dongdong Chen, Yen-Chun Chen, Jianmin Bao, Shaozhe Hao, Lu Yuan, and Kwan-Yee K Wong. 2023. Uni-controlnet: All-in-one control to text-to-image diffusion models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 11127-11150.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_11"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01433"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i21.34474"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755656","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:57:21Z","timestamp":1765342641000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755656"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":51,"alternative-id":["10.1145\/3746027.3755656","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755656","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}