{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T20:12:48Z","timestamp":1774642368953,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China under Grant"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681072","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"7008-7017","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["AudioLCM: Efficient and High-Quality Text-to-Audio Generation with Minimal Inference Steps"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-5782-5641","authenticated-orcid":false,"given":"Huadai","family":"Liu","sequence":"first","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1695-9000","authenticated-orcid":false,"given":"Rongjie","family":"Huang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8246-0741","authenticated-orcid":false,"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3336-2712","authenticated-orcid":false,"given":"Hengyuan","family":"Cao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2786-3896","authenticated-orcid":false,"given":"Jialei","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9708-3225","authenticated-orcid":false,"given":"Xize","family":"Cheng","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6201-1864","authenticated-orcid":false,"given":"Siqi","family":"Zheng","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6121-0384","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325","author":"Agostinelli Andrea","year":"2023","unstructured":"Andrea Agostinelli, Timo I Denk, Zal\u00e1n Borsos, Jesse Engel, Mauro Verzetti, Antoine Caillon, Qingqing Huang, Aren Jansen, Adam Roberts, Marco Tagliasacchi, et al. 2023. Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Yatong Bai Trung Dang Dung Tran Kazuhito Koishida and Somayeh Sojoudi. 2024. ConsistencyTTA: Accelerating Diffusion-Based Text-to-Audio Generation with Consistency Distillation. arXiv:2309.10740 [cs.SD] https:\/\/arxiv.org\/abs\/ 2309.10740"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447265"},{"key":"e_1_3_2_1_4_1","unstructured":"Jade Copet Felix Kreuk Itai Gat Tal Remez David Kant Gabriel Synnaeve Yossi Adi and Alexandre D\u00e9fossez. 2023. Simple and Controllable Music Generation. arXiv:2306.05284 [cs.SD]"},{"key":"e_1_3_2_1_5_1","volume-title":"Emovie: A mandarin emotion speech dataset with a simple emotional text-to-speech model. arXiv preprint arXiv:2106.09317","author":"Cui Chenye","year":"2021","unstructured":"Chenye Cui, Yi Ren, Jinglin Liu, Feiyang Chen, Rongjie Huang, Ming Lei, and Zhou Zhao. 2021. Emovie: A mandarin emotion speech dataset with a simple emotional text-to-speech model. arXiv preprint arXiv:2106.09317 (2021)."},{"key":"e_1_3_2_1_6_1","volume-title":"Riffusion-Stable diffusion for real-time music generation. URL https:\/\/riffusion. com","author":"Forsgren Seth","year":"2022","unstructured":"Seth Forsgren and Hayk Martiros. 2022. Riffusion-Stable diffusion for real-time music generation. URL https:\/\/riffusion. com (2022)."},{"key":"e_1_3_2_1_7_1","unstructured":"Peng Gao Le Zhuo Ziyi Lin Chris Liu Junsong Chen Ruoyi Du Enze Xie Xu Luo Longtian Qiu Yuhang Zhang et al. 2024. Lumina-T2X: Transforming Text into Any Modality Resolution and Duration via Flow-based Large Diffusion Transformers. arXiv preprint arXiv:2405.05945 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Deepanway Ghosal Navonil Majumder Ambuj Mehrish and Soujanya Poria. 2023. Text-to-Audio Generation using Instruction-Tuned LLM and Latent Diffusion Model. arXiv:2304.13731 [eess.AS]","DOI":"10.1145\/3581783.3612348"},{"key":"e_1_3_2_1_9_1","volume-title":"Proc. of NeurIPS","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. [n. d.]. Denoising Diffusion Probabilistic Models. In Proc. of NeurIPS (2020)."},{"key":"e_1_3_2_1_10_1","volume-title":"NeurIPS 2021 workshop on deep generative models and downstream applications.","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans. 2021. Classifier-free diffusion guidance. In NeurIPS 2021 workshop on deep generative models and downstream applications."},{"key":"e_1_3_2_1_11_1","unstructured":"Wenyi Hong Ming Ding Wendi Zheng Xinghan Liu and Jie Tang. [n.d.]. CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers. ([n. d.]). arXiv:2205.15868"},{"key":"e_1_3_2_1_12_1","unstructured":"Jiawei Huang Yi Ren Rongjie Huang Dongchao Yang Zhenhui Ye Chen Zhang Jinglin Liu Xiang Yin Zejun Ma and Zhou Zhao. 2023. Make-An-Audio 2: Temporal-Enhanced Text-to-Audio Generation. arXiv:2305.18474 [cs.SD]"},{"key":"e_1_3_2_1_13_1","unstructured":"Rongjie Huang Jiawei Huang Dongchao Yang Yi Ren Luping Liu Mingze Li Zhenhui Ye Jinglin Liu Xiang Yin and Zhou Zhao. 2023. Make-An-Audio: Text-To-Audio Generation with Prompt-Enhanced Diffusion Models. arXiv:2301.12661 [cs.SD]"},{"key":"e_1_3_2_1_14_1","volume-title":"Jun Wang, Dan Su, Dong Yu, Yi Ren, and Zhou Zhao.","author":"Huang Rongjie","year":"2022","unstructured":"Rongjie Huang, Max WY Lam, Jun Wang, Dan Su, Dong Yu, Yi Ren, and Zhou Zhao. 2022. FastDiff: A Fast Conditional Diffusion Model for High-Quality Speech Synthesis. arXiv preprint arXiv:2204.09934 (2022)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.437"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.508"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547855"},{"key":"e_1_3_2_1_18_1","first-page":"26565","article-title":"Elucidating the design space of diffusion-based generative models","volume":"35","author":"Karras Tero","year":"2022","unstructured":"Tero Karras, Miika Aittala, Timo Aila, and Samuli Laine. 2022. Elucidating the design space of diffusion-based generative models. Advances in Neural Information Processing Systems 35 (2022), 26565--26577.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","volume-title":"AudioCaps: Generating Captions for Audios in the Wild. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Kim Chris Dongjoo","year":"2019","unstructured":"Chris Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. [n. d.]. AudioCaps: Generating Captions for Audios in the Wild. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers) (2019). 119--132."},{"key":"e_1_3_2_1_20_1","unstructured":"Felix Kreuk Gabriel Synnaeve Adam Polyak Uriel Singer Alexandre D\u00e9fossez Jade Copet Devi Parikh Yaniv Taigman and Yossi Adi. [n. d.]. AudioGen: Textually Guided Audio Generation."},{"key":"e_1_3_2_1_21_1","unstructured":"Felix Kreuk Gabriel Synnaeve Adam Polyak Uriel Singer Alexandre D\u00e9fossez Jade Copet Devi Parikh Yaniv Taigman and Yossi Adi. 2023. AudioGen: Textually Guided Audio Generation. arXiv:2209.15352 [cs.SD]"},{"key":"e_1_3_2_1_22_1","unstructured":"Sang-gil Lee Wei Ping Boris Ginsburg Bryan Catanzaro and Sungroh Yoon. BigVGAN: A Universal Neural Vocoder with Large-Scale Training."},{"key":"e_1_3_2_1_23_1","unstructured":"Haohe Liu Zehua Chen Yi Yuan Xinhao Mei Xubo Liu Danilo Mandic Wenwu Wang and Mark D. Plumbley. [n. d.]. AudioLDM: Text-to-Audio Generation with Latent Diffusion Models."},{"key":"e_1_3_2_1_24_1","volume-title":"Vit-tts: visual text-to-speech with scalable diffusion transformer. arXiv preprint arXiv:2305.12708","author":"Liu Huadai","year":"2023","unstructured":"Huadai Liu, Rongjie Huang, Xuan Lin, Wenqiang Xu, Maozong Zheng, Hong Chen, Jinzheng He, and Zhou Zhao. 2023. Vit-tts: visual text-to-speech with scalable diffusion transformer. arXiv preprint arXiv:2305.12708 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"AudioLDM 2: Learning holistic audio generation with self-supervised pretraining. arXiv preprint arXiv:2308.05734","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Qiao Tian, Yi Yuan, Xubo Liu, Xinhao Mei, Qiuqiang Kong, Yuping Wang, Wenwu Wang, Yuxuan Wang, and Mark D Plumbley. 2023. AudioLDM 2: Learning holistic audio generation with self-supervised pretraining. arXiv preprint arXiv:2308.05734 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Liu Xingchao","year":"2023","unstructured":"Xingchao Liu, Xiwen Zhang, Jianzhu Ma, Jian Peng, et al. 2023. Instaflow: One step is enough for high-quality diffusion-based text-to-image generation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_27_1","first-page":"5775","article-title":"Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps","volume":"35","author":"Lu Cheng","year":"2022","unstructured":"Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu. 2022. Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps. Advances in Neural Information Processing Systems 35 (2022), 5775--5787.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_28_1","volume-title":"Latent consistency models: Synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378","author":"Luo Simian","year":"2023","unstructured":"Simian Luo, Yiqin Tan, Longbo Huang, Jian Li, and Hang Zhao. 2023. Latent consistency models: Synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378 (2023)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645316"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583451"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01374"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","unstructured":"Alex Nichol Prafulla Dhariwal Aditya Ramesh Pranav Shyam Pamela Mishkin Bob McGrew Ilya Sutskever and Mark Chen. [n. d.]. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. https:\/\/doi.org\/10.48550\/ARXIV.2112.10741","DOI":"10.48550\/ARXIV.2112.10741"},{"key":"e_1_3_2_1_33_1","volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_35_1","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily Denton Seyed Kamyar Seyed Ghasemipour Burcu Karagol Ayan S Sara Mahdavi Rapha Gontijo Lopes et al. [n. d.]. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. ([n. d.]). arXiv:2205.11487"},{"key":"e_1_3_2_1_36_1","volume-title":"Progressive distillation for fast sampling of diffusion models. arXiv preprint arXiv:2202.00512","author":"Salimans Tim","year":"2022","unstructured":"Tim Salimans and Jonathan Ho. 2022. Progressive distillation for fast sampling of diffusion models. arXiv preprint arXiv:2202.00512 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"Mo\u00fbsai: Text-to-Music Generation with Long-Context Latent Diffusion. arXiv:2301.11757 [cs.CL]","author":"Schneider Flavio","year":"2023","unstructured":"Flavio Schneider, Zhijing Jin, and Bernhard Sch\u00f6lkopf. 2023. Mo\u00fbsai: Text-to-Music Generation with Long-Context Latent Diffusion. arXiv:2301.11757 [cs.CL]"},{"key":"e_1_3_2_1_38_1","volume-title":"Glu variants improve transformer. arXiv preprint arXiv:2002.05202","author":"Shazeer Noam","year":"2020","unstructured":"Noam Shazeer. 2020. Glu variants improve transformer. arXiv preprint arXiv:2002.05202 (2020)."},{"key":"e_1_3_2_1_39_1","unstructured":"Uriel Singer Adam Polyak Thomas Hayes Xi Yin Jie An Songyang Zhang Qiyuan Hu Harry Yang Oron Ashual Oran Gafni et al. [n. d.]. Make-a-Video: Text-to-video Generation without Text-Video Data. ([n. d.]). arXiv:2209.14792"},{"key":"e_1_3_2_1_40_1","volume-title":"Consistency models. arXiv preprint arXiv:2303.01469","author":"Song Yang","year":"2023","unstructured":"Yang Song, Prafulla Dhariwal, Mark Chen, and Ilya Sutskever. 2023. Consistency models. arXiv preprint arXiv:2303.01469 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"Generative modeling by estimating gradients of the data distribution. Advances in neural information processing systems 32","author":"Song Yang","year":"2019","unstructured":"Yang Song and Stefano Ermon. 2019. Generative modeling by estimating gradients of the data distribution. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_42_1","volume-title":"Improved techniques for training scorebased generative models. Advances in neural information processing systems 33","author":"Song Yang","year":"2020","unstructured":"Yang Song and Stefano Ermon. 2020. Improved techniques for training scorebased generative models. Advances in neural information processing systems 33 (2020), 12438--12448."},{"key":"e_1_3_2_1_43_1","volume-title":"Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2020. Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"e_1_3_2_1_45_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv:2302.13971 [cs.CL]"},{"key":"e_1_3_2_1_46_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_47_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2023. Attention Is All You Need. arXiv:1706.03762 [cs.CL]"},{"key":"e_1_3_2_1_48_1","volume-title":"Videolcm: Video latent consistency model. arXiv preprint arXiv:2312.09109","author":"Wang Xiang","year":"2023","unstructured":"Xiang Wang, Shiwei Zhang, Han Zhang, Yu Liu, Yingya Zhang, Changxin Gao, and Nong Sang. 2023. Videolcm: Video latent consistency model. arXiv preprint arXiv:2312.09109 (2023)."},{"key":"e_1_3_2_1_49_1","unstructured":"Dongchao Yang Jianwei Yu Helin Wang Wen Wang Chao Weng Yuexian Zou and Dong Yu. [n. d.]. Diffsound: Discrete Diffusion Model for Text-to-sound Generation."},{"key":"e_1_3_2_1_50_1","volume-title":"Diffsound: Discrete diffusion model for text-to-sound generation. arXiv preprint arXiv:2207.09983","author":"Yang Dongchao","year":"2022","unstructured":"Dongchao Yang, Jianwei Yu, Helin Wang, Wen Wang, Chao Weng, Yuexian Zou, and Dong Yu. 2022. Diffsound: Discrete diffusion model for text-to-sound generation. arXiv preprint arXiv:2207.09983 (2022)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612061"},{"key":"e_1_3_2_1_52_1","volume-title":"Root mean square layer normalization. Advances in Neural Information Processing Systems 32","author":"Zhang Biao","year":"2019","unstructured":"Biao Zhang and Rico Sennrich. 2019. Root mean square layer normalization. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_53_1","volume-title":"Fast sampling of diffusion models with exponential integrator. arXiv preprint arXiv:2204.13902","author":"Zhang Qinsheng","year":"2022","unstructured":"Qinsheng Zhang and Yongxin Chen. 2022. Fast sampling of diffusion models with exponential integrator. arXiv preprint arXiv:2204.13902 (2022)."},{"key":"e_1_3_2_1_54_1","unstructured":"Le Zhuo Ruoyi Du Han Xiao Yangguang Li Dongyang Liu Rongjie Huang Wenze Liu Lirui Zhao Fu-Yun Wang Zhanyu Ma et al. 2024. Lumina-Next: Making Lumina-T2X Stronger and Faster with Next-DiT. arXiv preprint arXiv:2406.18583 (2024)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681072","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681072","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:52Z","timestamp":1750294672000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681072"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":54,"alternative-id":["10.1145\/3664647.3681072","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681072","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}