{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,2]],"date-time":"2026-05-02T14:52:42Z","timestamp":1777733562071,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376131"],"award-info":[{"award-number":["62376131"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755170","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:39:06Z","timestamp":1761377946000},"page":"9871-9880","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["FreeAudio: Training-Free Timing Planning for Controllable Long-Form Text-to-Audio Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-7134-7810","authenticated-orcid":false,"given":"Yuxuan","family":"Jiang","sequence":"first","affiliation":[{"name":"Department of Electronic Engineering, Tsinghua University, Beijing, China and Shengshu AI, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2294-7447","authenticated-orcid":false,"given":"Zehua","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China and Shengshu AI, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8573-8536","authenticated-orcid":false,"given":"Zeqian","family":"Ju","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7902-3759","authenticated-orcid":false,"given":"Chang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China and Shengshu AI, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8555-2776","authenticated-orcid":false,"given":"Weibei","family":"Dou","sequence":"additional","affiliation":[{"name":"Department of Electronic Engineering, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6254-2388","authenticated-orcid":false,"given":"Jun","family":"Zhu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China and Shengshu AI, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325","author":"Agostinelli Andrea","year":"2023","unstructured":"Andrea Agostinelli, Timo I Denk, Zal\u00e1n Borsos, Jesse Engel, Mauro Verzetti, Antoine Caillon, Qingqing Huang, Aren Jansen, Adam Roberts, Marco Tagliasacchi, et al., 2023. Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Multidiffusion: Fusing diffusion paths for controlled image generation.","author":"Bar-Tal Omer","year":"2023","unstructured":"Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel. 2023. Multidiffusion: Fusing diffusion paths for controlled image generation. (2023)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Zal\u00e1n Borsos Rapha\u00ebl Marinier Damien Vincent Eugene Kharitonov Olivier Pietquin Matt Sharifi Dominik Roblek Olivier Teboul David Grangier Marco Tagliasacchi et al. 2023. Audiolm: a language modeling approach to audio generation. IEEE\/ACM transactions on audio speech and language processing Vol. 31 (2023) 2523-2533.","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"e_1_3_2_1_4_1","volume-title":"A survey of deep learning audio generation methods. arXiv preprint arXiv:2406.00146","author":"Bo\u017ei\u0107 Matej","year":"2024","unstructured":"Matej Bo\u017ei\u0107 and Marko Horvat. 2024. A survey of deep learning audio generation methods. arXiv preprint arXiv:2406.00146 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_1_6_1","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung Hyung Won","year":"2024","unstructured":"Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al., 2024. Scaling instruction-finetuned language models. Journal of Machine Learning Research, Vol. 25, 70 (2024), 1-53.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_7_1","first-page":"47704","article-title":"Simple and controllable music generation","volume":"36","author":"Copet Jade","year":"2023","unstructured":"Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi, and Alexandre D\u00e9fossez. 2023. Simple and controllable music generation. Advances in Neural Information Processing Systems, Vol. 36 (2023), 47704-47720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_8_1","volume-title":"Latent Swap Joint Diffusion for Long-Form Audio Generation. arXiv preprint arXiv:2502.05130","author":"Dai Yusheng","year":"2025","unstructured":"Yusheng Dai, Chenxi Wang, Chang Li, Chen Wang, Jun Du, Kewei Li, Ruoyu Wang, Jiefeng Ma, Lei Sun, and Jianqing Gao. 2025. Latent Swap Joint Diffusion for Long-Form Audio Generation. arXiv preprint arXiv:2502.05130 (2025)."},{"key":"e_1_3_2_1_9_1","volume-title":"Controllable Text-to-Audio Generation with Training-Free Temporal Guidance Diffusion. In 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1-6.","author":"Du Tianjiao","year":"2024","unstructured":"Tianjiao Du, Jun Chen, Jiasheng Lu, Qinmei Xu, Huan Liao, Yupeng Chen, and Zhiyong Wu. 2024. Controllable Text-to-Audio Generation with Training-Free Temporal Guidance Diffusion. In 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1-6."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.31219\/osf.io\/m5eba"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)","author":"Ebbers Janek","year":"2021","unstructured":"Janek Ebbers and Reinhold Haeb-Umbach. 2021b. Self-trained audio tagging and sound event detection in domestic environments. In Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)."},{"key":"e_1_3_2_1_12_1","unstructured":"Janek Ebbers and Reinhold Haeb-Umbach. 2022. Pre-training and self-training for sound event detection in domestic environments. (2022)."},{"key":"e_1_3_2_1_13_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Evans Zach","year":"2024","unstructured":"Zach Evans, CJ Carr, Josiah Taylor, Scott H Hawley, and Jordi Pons. 2024a. Fast timing-conditioned latent audio diffusion. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_14_1","volume-title":"Long-form music generation with latent diffusion. arXiv preprint arXiv:2404.10301","author":"Evans Zach","year":"2024","unstructured":"Zach Evans, Julian D Parker, CJ Carr, Zack Zukowski, Josiah Taylor, and Jordi Pons. 2024b. Long-form music generation with latent diffusion. arXiv preprint arXiv:2404.10301 (2024)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888461"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3133208"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612348"},{"key":"e_1_3_2_1_19_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29773"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414579"},{"key":"e_1_3_2_1_22_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840-6851."},{"key":"e_1_3_2_1_23_1","first-page":"28708","article-title":"Masked autoencoders that listen","volume":"35","author":"Huang Po-Yao","year":"2022","unstructured":"Po-Yao Huang, Hu Xu, Juncheng Li, Alexei Baevski, Michael Auli, Wojciech Galuba, Florian Metze, and Christoph Feichtenhofer. 2022. Masked autoencoders that listen. Advances in Neural Information Processing Systems, Vol. 35 (2022), 28708-28720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Machine Learning. PMLR, 13916-13932","author":"Huang Rongjie","year":"2023","unstructured":"Rongjie Huang, Jiawei Huang, Dongchao Yang, Yi Ren, Luping Liu, Mingze Li, Zhenhui Ye, Jinglin Liu, Xiang Yin, and Zhou Zhao. 2023. Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models. In International Conference on Machine Learning. PMLR, 13916-13932."},{"key":"e_1_3_2_1_25_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_26_1","unstructured":"Chaeyoung Jung Hojoon Ki Ji-Hoon Kim and Joon Son Chung. [n.d.]. InfiniteAudio: Infinite-Length Audio Generation with Consistent Acoustic Attributes. ([n.d.])."},{"key":"e_1_3_2_1_27_1","unstructured":"Koray Kavukcuoglu. 2025. Gemini 2.5: Our most intelligent AI model. Google DeepMind. https:\/\/blog.google\/technology\/google-deepmind\/gemini-model-thinking-updates-march-2025\/"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Kim Chris Dongjoo","year":"2019","unstructured":"Chris Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. 2019. Audiocaps: Generating captions for audios in the wild. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 119-132."},{"key":"e_1_3_2_1_29_1","volume-title":"Audiogen: Textually guided audio generation. arXiv preprint arXiv:2209.15352","author":"Kreuk Felix","year":"2022","unstructured":"Felix Kreuk, Gabriel Synnaeve, Adam Polyak, Uriel Singer, Alexandre D\u00e9fossez, Jade Copet, Devi Parikh, Yaniv Taigman, and Yossi Adi. 2022. Audiogen: Textually guided audio generation. arXiv preprint arXiv:2209.15352 (2022)."},{"key":"e_1_3_2_1_30_1","volume-title":"ETTA: Elucidating the Design Space of Text-to-Audio Models. arXiv preprint arXiv:2412.19351","author":"Kong Zhifeng","year":"2024","unstructured":"Sang-gil Lee, Zhifeng Kong, Arushi Goel, Sungwon Kim, Rafael Valle, and Bryan Catanzaro. 2024. ETTA: Elucidating the Design Space of Text-to-Audio Models. arXiv preprint arXiv:2412.19351 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"QA-MDT: Quality-aware Masked Diffusion Transformer for Enhanced Music Generation. arXiv preprint arXiv:2405.15863","author":"Li Chang","year":"2024","unstructured":"Chang Li, Ruoyu Wang, Lijuan Liu, Jun Du, Yixuan Sun, Zilu Guo, Zhenrong Zhang, Yuan Jiang, Jianqing Gao, and Feng Ma. 2024. QA-MDT: Quality-aware Masked Diffusion Transformer for Enhanced Music Generation. arXiv preprint arXiv:2405.15863 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark D Plumbley. 2023. Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Audioldm 2: Learning holistic audio generation with self-supervised pretraining","author":"Liu Haohe","year":"2024","unstructured":"Haohe Liu, Yi Yuan, Xubo Liu, Xinhao Mei, Qiuqiang Kong, Qiao Tian, Yuping Wang, Wenwu Wang, Yuxuan Wang, and Mark D Plumbley. 2024. Audioldm 2: Learning holistic audio generation with self-supervised pretraining. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research","author":"Mei Xinhao","year":"2024","unstructured":"Xinhao Mei, Chutong Meng, Haohe Liu, Qiuqiang Kong, Tom Ko, Chengqi Zhao, Mark D Plumbley, Yuexian Zou, and Wenwu Wang. 2024. Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.3390\/app6060162"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_2_1_37_1","volume-title":"Both Ears Wide Open: Towards Language-Driven Spatial Audio Generation. arXiv preprint arXiv:2410.10676","author":"Sun Peiwen","year":"2024","unstructured":"Peiwen Sun, Sitong Cheng, Xiangtai Li, Zhen Ye, Huadai Liu, Honggang Zhang, Wei Xue, and Yike Guo. 2024. Both Ears Wide Open: Towards Language-Driven Spatial Audio Generation. arXiv preprint arXiv:2410.10676 (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.33682\/006b-jx26"},{"key":"e_1_3_2_1_39_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2025.3562946"},{"key":"e_1_3_2_1_41_1","volume-title":"Audio-agent: Leveraging llms for audio generation, editing and composition. arXiv preprint arXiv:2410.03335","author":"Wang Zixuan","year":"2024","unstructured":"Zixuan Wang, Chi-Keung Tang, and Yu-Wing Tai. 2024. Audio-agent: Leveraging llms for audio generation, editing and composition. arXiv preprint arXiv:2410.03335 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_1_44_1","volume-title":"Picoaudio: Enabling precise timestamp and frequency controllability of audio events in text-to-audio generation. arXiv preprint arXiv:2407.02869","author":"Xie Zeyu","year":"2024","unstructured":"Zeyu Xie, Xuenan Xu, Zhizheng Wu, and Mengyue Wu. 2024. Picoaudio: Enabling precise timestamp and frequency controllability of audio events in text-to-audio generation. arXiv preprint arXiv:2407.02869 (2024)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889879"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3443614"},{"key":"e_1_3_2_1_47_1","volume-title":"Uniaudio: An audio foundation model toward universal audio generation. arXiv preprint arXiv:2310.00704","author":"Yang Dongchao","year":"2023","unstructured":"Dongchao Yang, Jinchuan Tian, Xu Tan, Rongjie Huang, Songxiang Liu, Xuankai Chang, Jiatong Shi, Sheng Zhao, Jiang Bian, Xixin Wu, et al., 2023. Uniaudio: An audio foundation model toward universal audio generation. arXiv preprint arXiv:2310.00704 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Yang Ling","year":"2024","unstructured":"Ling Yang, Zhaochen Yu, Chenlin Meng, Minkai Xu, Stefano Ermon, and Bin Cui. 2024. Mastering text-to-image diffusion: Recaptioning, planning, and generating with multimodal llms. In Forty-first International Conference on Machine Learning."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755170","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:58:26Z","timestamp":1765310306000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755170"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":48,"alternative-id":["10.1145\/3746027.3755170","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755170","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}