{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:27:13Z","timestamp":1775230033915,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Ministry of Education, Singapore","award":["T2MOE2008"],"award-info":[{"award-number":["T2MOE2008"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612348","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"3590-3598","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":78,"title":["Text-to-Audio Generation using Instruction Guided Latent Diffusion Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3858-4449","authenticated-orcid":false,"given":"Deepanway","family":"Ghosal","sequence":"first","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1449-617X","authenticated-orcid":false,"given":"Navonil","family":"Majumder","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4240-9915","authenticated-orcid":false,"given":"Ambuj","family":"Mehrish","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3167-2208","authenticated-orcid":false,"given":"Soujanya","family":"Poria","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"et al. Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325","author":"Agostinelli Andrea","year":"2023","unstructured":"Andrea Agostinelli, Timo I Denk, Zal\u00e1n Borsos, Jesse Engel, Mauro Verzetti, An-toine Caillon, Qingqing Huang, Aren Jansen, Adam Roberts, Marco Tagliasacchi, et al. Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325, 2023."},{"key":"e_1_3_2_1_2_1","volume-title":"Wavegrad: Estimating gradients for waveform generation. arXiv preprint arXiv:2009.00713","author":"Chen Nanxin","year":"2020","unstructured":"Nanxin Chen, Yu Zhang, Heiga Zen, Ron J Weiss, Mohammad Norouzi, and William Chan. Wavegrad: Estimating gradients for waveform generation. arXiv preprint arXiv:2009.00713, 2020."},{"key":"e_1_3_2_1_3_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Eric Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma Albert Webson Shixiang Shane Gu Zhuyun Dai Mirac Suzgun Xinyun Chen Aakanksha Chowdhery Sharan Narang Gaurav Mishra Adams Yu Vincent Zhao Yanping Huang Andrew Dai Hongkun Yu Slav Petrov Ed H. Chi Jeff Dean Jacob Devlin Adam Roberts Denny Zhou Quoc V. Le and Jason Wei. Scaling instruction-finetuned language models 2022. URL https:\/\/arxiv.org\/abs\/2210.11416."},{"key":"e_1_3_2_1_4_1","volume-title":"Why can gpt learn in-context? language models secretly perform gradient descent as meta-optimizers. ArXiv, abs\/2212.10559","author":"Dai Damai","year":"2022","unstructured":"Damai Dai, Yutao Sun, Li Dong, Yaru Hao, Zhifang Sui, and Furu Wei. Why can gpt learn in-context? language models secretly perform gradient descent as meta-optimizers. ArXiv, abs\/2212.10559, 2022."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00684"},{"key":"e_1_3_2_1_7_1","volume-title":"NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans. Classifier-free diffusion guidance. NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications, 2021."},{"key":"e_1_3_2_1_8_1","volume-title":"Jun Wang, Dan Su, Dong Yu, Yi Ren, and Zhou Zhao. Fastdiff: A fast conditional diffusion model for high-quality speech synthesis. arXiv preprint arXiv:2204.09934","author":"Huang Rongjie","year":"2022","unstructured":"Rongjie Huang, Max WY Lam, Jun Wang, Dan Su, Dong Yu, Yi Ren, and Zhou Zhao. Fastdiff: A fast conditional diffusion model for high-quality speech synthesis. arXiv preprint arXiv:2204.09934, 2022."},{"key":"e_1_3_2_1_9_1","first-page":"5967","volume-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Isola Phillip","year":"2016","unstructured":"Phillip Isola, Jun-Yan Zhu, Tinghui Zhou, and Alexei A. Efros. Image-to-image translation with conditional adversarial networks. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pages 5967--5976, 2016."},{"key":"e_1_3_2_1_10_1","volume-title":"Byoung Jin Choi, and Nam Soo Kim. Diff-tts: A denoising diffusion model for text-to-speech. arXiv preprint arXiv:2104.01409","author":"Jeong Myeonghun","year":"2021","unstructured":"Myeonghun Jeong, Hyeongju Kim, Sung Jun Cheon, Byoung Jin Choi, and Nam Soo Kim. Diff-tts: A denoising diffusion model for text-to-speech. arXiv preprint arXiv:2104.01409, 2021."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Kim Chris Dongjoo","year":"2019","unstructured":"Chris Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. Audiocaps: Generating captions for audios in the wild. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pages 119--132, 2019."},{"key":"e_1_3_2_1_13_1","volume-title":"Kingma and Max Welling. Auto-encoding variational bayes. CoRR, abs\/1312.6114","author":"Diederik","year":"2013","unstructured":"Diederik P. Kingma and Max Welling. Auto-encoding variational bayes. CoRR, abs\/1312.6114, 2013."},{"key":"e_1_3_2_1_14_1","first-page":"17022","article-title":"Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in Neural Information Processing Systems, 33:17022--17033, 2020.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_15_1","volume-title":"International Society for Music Information Retrieval Conference","author":"Kong Qiuqiang","year":"2021","unstructured":"Qiuqiang Kong, Yin Cao, Haohe Liu, Keunwoo Choi, and Yuxuan Wang. Decoupling magnitude and phase estimation with deep resunet for music source separation. In International Society for Music Information Retrieval Conference, 2021."},{"key":"e_1_3_2_1_16_1","volume-title":"Diffwave: A versatile diffusion model for audio synthesis. arXiv preprint arXiv:2009.09761","author":"Kong Zhifeng","year":"2020","unstructured":"Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. Diffwave: A versatile diffusion model for audio synthesis. arXiv preprint arXiv:2009.09761, 2020."},{"key":"e_1_3_2_1_17_1","volume-title":"Audiogen: Textually guided audio generation. ArXiv, abs\/2209.15352","author":"Kreuk Felix","year":"2022","unstructured":"Felix Kreuk, Gabriel Synnaeve, Adam Polyak, Uriel Singer, Alexandre D'efossez, Jade Copet, Devi Parikh, Yaniv Taigman, and Yossi Adi. Audiogen: Textually guided audio generation. ArXiv, abs\/2209.15352, 2022."},{"key":"e_1_3_2_1_18_1","volume-title":"AudioLDM: Text-to-audio generation with latent diffusion models. ArXiv, abs\/2301.12503","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo P. Mandic, Wenwu Wang, and Mark D. Plumbley. AudioLDM: Text-to-audio generation with latent diffusion models. ArXiv, abs\/2301.12503, 2023."},{"key":"e_1_3_2_1_19_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. ArXiv, abs\/1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. Roberta: A robustly optimized bert pretraining approach. ArXiv, abs\/1907.11692, 2019."},{"key":"e_1_3_2_1_20_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101, 2017."},{"key":"e_1_3_2_1_21_1","first-page":"659","volume-title":"2021 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","author":"Lu Yen-Ju","year":"2021","unstructured":"Yen-Ju Lu, Yu Tsao, and Shinji Watanabe. A study on speech enhancement based on diffusion probabilistic model. In 2021 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC), pages 659--666, 2021."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746901"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746901"},{"key":"e_1_3_2_1_24_1","volume-title":"Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395","author":"Mei Xinhao","year":"2023","unstructured":"Xinhao Mei, Chutong Meng, Haohe Liu, Qiuqiang Kong, Tom Ko, Chengqi Zhao, Mark D Plumbley, Yuexian Zou, and Wenwu Wang. Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395, 2023."},{"key":"e_1_3_2_1_25_1","volume-title":"Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499","author":"van den Oord Aaron","year":"2016","unstructured":"Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499, 2016."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_2_1_27_1","first-page":"8599","volume-title":"International Conference on Machine Learning","author":"Popov Vadim","year":"2021","unstructured":"Vadim Popov, Ivan Vovk, Vladimir Gogoryan, Tasnima Sadekova, and Mikhail Kudinov. Grad-tts: A diffusion probabilistic model for text-to-speech. In International Conference on Machine Learning, pages 8599--8608. PMLR, 2021."},{"key":"e_1_3_2_1_28_1","volume-title":"Diffusion-based voice conversion with fast maximum likelihood sampling scheme. arXiv preprint arXiv:2109.13821","author":"Popov Vadim","year":"2021","unstructured":"Vadim Popov, Ivan Vovk, Vladimir Gogoryan, Tasnima Sadekova, Mikhail Kudinov, and Jiansheng Wei. Diffusion-based voice conversion with fast maximum likelihood sampling scheme. arXiv preprint arXiv:2109.13821, 2021."},{"key":"e_1_3_2_1_29_1","volume-title":"Srtnet: Time domain speech enhancement via stochastic refinement. arXiv preprint arXiv:2210.16805","author":"Qiu Zhibin","year":"2022","unstructured":"Zhibin Qiu, Mengfan Fu, Yinfeng Yu, LiLi Yin, Fuchun Sun, and Hao Huang. Srtnet: Time domain speech enhancement via stochastic refinement. arXiv preprint arXiv:2210.16805, 2022."},{"issue":"140","key":"e_1_3_2_1_30_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research, 21(140):1--67, 2020. URL http:\/\/jmlr.org\/papers\/v21\/20-074.html.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_31_1","volume-title":"Zero-shot text-to-image generation. ArXiv, abs\/2102.12092","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. Zero-shot text-to-image generation. ArXiv, abs\/2102.12092, 2021."},{"key":"e_1_3_2_1_32_1","volume-title":"Hierarchical text-conditional image generation with clip latents. ArXiv, abs\/2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. Hierarchical text-conditional image generation with clip latents. ArXiv, abs\/2204.06125, 2022."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_2_1_35_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, 35: 36479--36494, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"e_1_3_2_1_37_1","volume-title":"Universal speech enhancement with score-based diffusion. arXiv preprint arXiv:2206.03065","author":"Serr\u00e0 Joan","year":"2022","unstructured":"Joan Serr\u00e0, Santiago Pascual, Jordi Pons, R Oguz Araz, and Davide Scaini. Universal speech enhancement with score-based diffusion. arXiv preprint arXiv:2206.03065, 2022."},{"key":"e_1_3_2_1_38_1","volume-title":"Denoising diffusion implicit models. ArXiv, abs\/2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. Denoising diffusion implicit models. ArXiv, abs\/2010.02502, 2020."},{"key":"e_1_3_2_1_39_1","volume-title":"Learning from between-class examples for deep sound recognition. CoRR, abs\/1711.10282","author":"Tokozume Yuji","year":"2017","unstructured":"Yuji Tokozume, Yoshitaka Ushiku, and Tatsuya Harada. Learning from between-class examples for deep sound recognition. CoRR, abs\/1711.10282, 2017. URL http:\/\/arxiv.org\/abs\/1711.10282."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2002.800560"},{"key":"e_1_3_2_1_41_1","volume-title":"https:\/\/en.wikipedia.org\/wiki\/Tango","year":"2021","unstructured":"Wikipedia. Tango. https:\/\/en.wikipedia.org\/wiki\/Tango, 2021. [Online; accessed 21-April-2023]."},{"key":"e_1_3_2_1_42_1","volume-title":"Tango music. https:\/\/en.wikipedia.org\/wiki\/Tango_music","year":"2021","unstructured":"Wikipedia. Tango music. https:\/\/en.wikipedia.org\/wiki\/Tango_music, 2021."},{"key":"e_1_3_2_1_43_1","volume-title":"Diffsound: Discrete diffusion model for text-to-sound generation. arXiv preprint arXiv:2207.09983","author":"Yang Dongchao","year":"2022","unstructured":"Dongchao Yang, Jianwei Yu, Helin Wang, Wen Wang, Chao Weng, Yuexian Zou, and Dong Yu. Diffsound: Discrete diffusion model for text-to-sound generation. arXiv preprint arXiv:2207.09983, 2022."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612348","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612348","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:59:33Z","timestamp":1755820773000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612348"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":43,"alternative-id":["10.1145\/3581783.3612348","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612348","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}