{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:26:35Z","timestamp":1775229995202,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Microsoft Foundational Model Research Grant"},{"name":"Ministry of Education, Singapore","award":["MOE-T2EP20220-0017"],"award-info":[{"award-number":["MOE-T2EP20220-0017"]}]},{"name":"AI Singapore Governance","award":["AISG3-GV-2023-010"],"award-info":[{"award-number":["AISG3-GV-2023-010"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681688","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"564-572","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":38,"title":["Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1449-617X","authenticated-orcid":false,"given":"Navonil","family":"Majumder","sequence":"first","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5252-0763","authenticated-orcid":false,"given":"Chia-Yu","family":"Hung","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3858-4449","authenticated-orcid":false,"given":"Deepanway","family":"Ghosal","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5546-5217","authenticated-orcid":false,"given":"Wei-Ning","family":"Hsu","sequence":"additional","affiliation":[{"name":"Meta AI, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0767-6703","authenticated-orcid":false,"given":"Rada","family":"Mihalcea","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6924-7931","authenticated-orcid":false,"given":"Soujanya","family":"Poria","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"James Betker Gabriel Goh Li Jing Tim Brooks Jianfeng Wang Linjie Li Long Ouyang Juntang Zhuang Joyce Lee Yufei Guo Wesam Manassra Prafulla Dhariwal Casey Chu Yunxin Jiao and Aditya Ramesh. [n. d.]. Improving Image Generation with Better Captions. https:\/\/api.semanticscholar.org\/CorpusID:264403242"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Zal\u00e1n Borsos Rapha\u00ebl Marinier Damien Vincent Eugene Kharitonov Olivier Pietquin Matt Sharifi Dominik Roblek Olivier Teboul David Grangier Marco Tagliasacchi et al. 2023. Audiolm: a language modeling approach to audio generation. IEEE\/ACM Transactions on Audio Speech and Language Processing (2023).","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"e_1_3_2_1_4_1","volume-title":"Text-to-audio generation using instruction-tuned llm and latent diffusion model. arXiv preprint arXiv:2304.13731","author":"Ghosal Deepanway","year":"2023","unstructured":"Deepanway Ghosal, Navonil Majumder, Ambuj Mehrish, and Soujanya Poria. 2023. Text-to-audio generation using instruction-tuned llm and latent diffusion model. arXiv preprint arXiv:2304.13731 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Text-to-Audio Generation using Instruction Tuned LLM and Latent Diffusion Model. arXiv preprint arXiv:2304.13731","author":"Ghosal Deepanway","year":"2023","unstructured":"Deepanway Ghosal, Navonil Majumder, Ambuj Mehrish, and Soujanya Poria. 2023. Text-to-Audio Generation using Instruction Tuned LLM and Latent Diffusion Model. arXiv preprint arXiv:2304.13731 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Armand Joulin, and Ishan Misra.","author":"Girdhar Rohit","year":"2023","unstructured":"Rohit Girdhar, Alaaeldin El-Nouby, Zhuang Liu, Mannat Singh, Kalyan Vasudev Alwala, Armand Joulin, and Ishan Misra. 2023. ImageBind: One Embedding Space To Bind Them All. In CVPR."},{"key":"e_1_3_2_1_7_1","volume-title":"Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models. arXiv preprint arXiv:2301.12661","author":"Huang Rongjie","year":"2023","unstructured":"Rongjie Huang, Jiawei Huang, Dongchao Yang, Yi Ren, Luping Liu, Mingze Li, Zhenhui Ye, Jinglin Liu, Xiang Yin, and Zhou Zhao. 2023. Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models. arXiv preprint arXiv:2301.12661 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Audiogen: Textually guided audio generation. arXiv preprint arXiv:2209.15352","author":"Kreuk Felix","year":"2022","unstructured":"Felix Kreuk, Gabriel Synnaeve, Adam Polyak, Uriel Singer, Alexandre D\u00e9fossez, Jade Copet, Devi Parikh, Yaniv Taigman, and Yossi Adi. 2022. Audiogen: Textually guided audio generation. arXiv preprint arXiv:2209.15352 (2022)."},{"key":"e_1_3_2_1_9_1","volume-title":"Towards general text embeddings with multi-stage contrastive learning. arXiv preprint arXiv:2308.03281","author":"Li Zehan","year":"2023","unstructured":"Zehan Li, Xin Zhang, Yanzhao Zhang, Dingkun Long, Pengjun Xie, and Meishan Zhang. 2023. Towards general text embeddings with multi-stage contrastive learning. arXiv preprint arXiv:2308.03281 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"BATON: Aligning Text-to-Audio Model with Human Preference Feedback. arxiv: 2402.00744 [cs.SD]","author":"Liao Huan","year":"2024","unstructured":"Huan Liao, Haonan Han, Kai Yang, Tianjiao Du, Rui Yang, Zunnan Xu, Qinmei Xu, Jingquan Liu, Jiasheng Lu, and Xiu Li. 2024. BATON: Aligning Text-to-Audio Model with Human Preference Feedback. arxiv: 2402.00744 [cs.SD]"},{"key":"e_1_3_2_1_11_1","volume-title":"Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark D Plumbley. 2023. Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"AudioLDM 2: Learning holistic audio generation with self-supervised pretraining. arXiv preprint arXiv:2308.05734","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Qiao Tian, Yi Yuan, Xubo Liu, Xinhao Mei, Qiuqiang Kong, Yuping Wang, Wenwu Wang, Yuxuan Wang, and Mark D Plumbley. 2023. AudioLDM 2: Learning holistic audio generation with self-supervised pretraining. arXiv preprint arXiv:2308.05734 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_14_1","volume-title":"Mustango: Toward Controllable Text-to-Music Generation. arxiv: 2311.08355 [eess.AS]","author":"Melechovsky Jan","year":"2024","unstructured":"Jan Melechovsky, Zixun Guo, Deepanway Ghosal, Navonil Majumder, Dorien Herremans, and Soujanya Poria. 2024. Mustango: Toward Controllable Text-to-Music Generation. arxiv: 2311.08355 [eess.AS]"},{"key":"e_1_3_2_1_15_1","unstructured":"OpenAI. 2023. DALL\u00b7E 2. https:\/\/openai.com\/dall-e-2"},{"key":"e_1_3_2_1_16_1","unstructured":"OpenAI. 2023. GPT-4. https:\/\/openai.com\/gpt-4"},{"key":"e_1_3_2_1_17_1","unstructured":"OpenAI. 2023. Introducing ChatGPT. https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_1_18_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019)."},{"key":"e_1_3_2_1_19_1","unstructured":"Rafael Rafailov Archit Sharma Eric Mitchell Stefano Ermon Christopher D. Manning and Chelsea Finn. 2023. Direct Preference Optimization: Your Language Model is Secretly a Reward Model. arxiv: 2305.18290 [cs.LG]"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_21_1","volume-title":"Audiobox: Unified Audio Generation with Natural Language Prompts. arxiv: 2312.15821 [cs.SD]","author":"Vyas Apoorv","year":"2023","unstructured":"Apoorv Vyas, Bowen Shi, Matthew Le, Andros Tjandra, Yi-Chiao Wu, Baishan Guo, Jiemin Zhang, Xinyue Zhang, Robert Adkins, William Ngan, Jeff Wang, Ivan Cruz, Bapi Akula, Akinniyi Akinyemi, Brian Ellis, Rashel Moritz, Yael Yungster, Alice Rakotoarison, Liang Tan, Chris Summers, Carleigh Wood, Joshua Lane, Mary Williamson, and Wei-Ning Hsu. 2023. Audiobox: Unified Audio Generation with Natural Language Prompts. arxiv: 2312.15821 [cs.SD]"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Bram Wallace Meihua Dang Rafael Rafailov Linqi Zhou Aaron Lou Senthil Purushwalkam Stefano Ermon Caiming Xiong Shafiq Joty and Nikhil Naik. 2023. Diffusion Model Alignment Using Direct Preference Optimization. arxiv: 2311.12908 [cs.CV]","DOI":"10.1109\/CVPR52733.2024.00786"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_1_24_1","unstructured":"Zeyu Xie Xuenan Xu Zhizheng Wu and Mengyue Wu. 2024. AudioTime: A Temporally-aligned Audio-text Benchmark Dataset. arxiv: 2407.02857 [cs.SD] https:\/\/arxiv.org\/abs\/2407.02857"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681688","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681688","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:50Z","timestamp":1750295870000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681688"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":25,"alternative-id":["10.1145\/3664647.3681688","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681688","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}