{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:14:06Z","timestamp":1765343646363,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758284","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"13266-13272","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["AudioAtlas: A Comprehensive and Balanced Benchmark Towards Movie-Oriented Text-to-Audio Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1457-5120","authenticated-orcid":false,"given":"Chenxi","family":"Wang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1290-2568","authenticated-orcid":false,"given":"Yusheng","family":"Dai","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7680-6455","authenticated-orcid":false,"given":"Lei","family":"Sun","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2387-0389","authenticated-orcid":false,"given":"Jun","family":"Du","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5575-4940","authenticated-orcid":false,"given":"Jianqing","family":"Gao","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, Anhui, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Artlist. [n.d.]. Artlist Sound Effects. https:\/\/artlist.io\/sfx. Accessed: 2025-05-31."},{"key":"e_1_3_2_1_2_1","unstructured":"AudioStock. [n.d.]. AudioStock Sound Effects. https:\/\/www.audiostock.io\/. Accessed: 2025-05-31."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning. 12345-12355","author":"Bar-Tal Omer","year":"2023","unstructured":"Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel. 2023. MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation. In Proceedings of the 40th International Conference on Machine Learning. 12345-12355."},{"key":"e_1_3_2_1_4_1","volume-title":"Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis. arXiv preprint arXiv:2412.15322","author":"Cheng Ho Kei","year":"2024","unstructured":"Ho Kei Cheng, Masato Ishii, Akio Hayakawa, Takashi Shibuya, Alexander Schwing, and Yuki Mitsufuji. 2024. Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis. arXiv preprint arXiv:2412.15322 (2024)."},{"key":"e_1_3_2_1_5_1","unstructured":"Yunfei Chu Jin Xu Qian Yang Haojie Wei Xipin Wei Zhifang Guo Yichong Leng Yuanjun Lv Jinzheng He Junyang Lin Chang Zhou and Jingren Zhou. 2024. Qwen2-Audio Technical Report. arXiv preprint arXiv:2407.10759 (2024)."},{"key":"e_1_3_2_1_6_1","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung Hyung Won","year":"2024","unstructured":"Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al., 2024. Scaling instruction-finetuned language models. Journal of Machine Learning Research, Vol. 25, 70 (2024), 1-53.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_7_1","volume-title":"Latent Swap Joint Diffusion for Long-Form Audio Generation. arXiv preprint arXiv:2502.05130","author":"Dai Yusheng","year":"2025","unstructured":"Yusheng Dai, Chenxi Wang, Chang Li, Chen Wang, Jun Du, Kewei Li, Ruoyu Wang, Jiefeng Ma, Lei Sun, and Jianqing Gao. 2025. Latent Swap Joint Diffusion for Long-Form Audio Generation. arXiv preprint arXiv:2502.05130 (2025)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"e_1_3_2_1_9_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Evans Zach","year":"2024","unstructured":"Zach Evans, CJ Carr, Josiah Taylor, Scott H Hawley, and Jordi Pons. 2024. Fast timing-conditioned latent audio diffusion. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02117"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_12_1","unstructured":"Deepanway Ghosal Navonil Majumder Ambuj Mehrish and Soujanya Poria. [n.d.]. Text-to-Audio Generation using Instruction-Tuned LLM and Latent Diffusion Model. ([n.d.])."},{"key":"e_1_3_2_1_13_1","volume-title":"Audio Flamingo 2: An Audio-Language Model with Long-Audio Understanding and Expert Reasoning Abilities. arXiv preprint arXiv:2503.03983","author":"Ghosh Sreyan","year":"2025","unstructured":"Sreyan Ghosh, Zhifeng Kong, Sonal Kumar, S Sakshi, Jaehyeon Kim, Wei Ping, Rafael Valle, Dinesh Manocha, and Bryan Catanzaro. 2025. Audio Flamingo 2: An Audio-Language Model with Long-Audio Understanding and Expert Reasoning Abilities. arXiv preprint arXiv:2503.03983 (2025)."},{"key":"e_1_3_2_1_14_1","volume-title":"Audio Generation with Multiple Conditional Diffusion Model. arXiv preprint arXiv:2308.11940","author":"Guo Zhifang","year":"2023","unstructured":"Zhifang Guo, Jianguo Mao, Rui Tao, Long Yan, Kazushige Ouchi, Hong Liu, and Xiangdong Wang. 2023. Audio Generation with Multiple Conditional Diffusion Model. arXiv preprint arXiv:2308.11940 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Taming data and transformers for audio generation. arXiv preprint arXiv:2406.19388","author":"Haji-Ali Moayed","year":"2024","unstructured":"Moayed Haji-Ali, Willi Menapace, Aliaksandr Siarohin, Guha Balakrishnan, Sergey Tulyakov, and Vicente Ordonez. 2024. Taming data and transformers for audio generation. arXiv preprint arXiv:2406.19388 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"The Benefit of Temporally-Strong Labels in Audio Event Classification. In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 366-370","author":"Hershey Shawn","year":"2021","unstructured":"Shawn Hershey, Daniel PW Ellis, Eduardo Fonseca, Aren Jansen, Caroline Liu, R Channing Moore, and Manoj Plakal. 2021. The Benefit of Temporally-Strong Labels in Audio Event Classification. In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 366-370."},{"key":"e_1_3_2_1_17_1","volume-title":"Make-an-audio 2: Temporal-enhanced text-to-audio generation. arXiv preprint arXiv:2305.18474","author":"Huang Jiawei","year":"2023","unstructured":"Jiawei Huang, Yi Ren, Rongjie Huang, Dongchao Yang, Zhenhui Ye, Chen Zhang, Jinglin Liu, Xiang Yin, Zejun Ma, and Zhou Zhao. 2023. Make-an-audio 2: Temporal-enhanced text-to-audio generation. arXiv preprint arXiv:2305.18474 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"FreeAudio: Training-Free Timing Planning for Controllable Long-Form Text-to-Audio Generation. arXiv preprint arXiv:2507.08557","author":"Jiang Yuxuan","year":"2025","unstructured":"Yuxuan Jiang, Zehua Chen, Zeqian Ju, Chang Li, Weibei Dou, and Jun Zhu. 2025. FreeAudio: Training-Free Timing Planning for Controllable Long-Form Text-to-Audio Generation. arXiv preprint arXiv:2507.08557 (2025)."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Kim Chris Dongjoo","year":"2019","unstructured":"Chris Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. 2019. Audiocaps: Generating captions for audios in the wild. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 119-132."},{"key":"e_1_3_2_1_20_1","volume-title":"Audiogen: Textually guided audio generation. arXiv preprint arXiv:2209.15352","author":"Kreuk Felix","year":"2022","unstructured":"Felix Kreuk, Gabriel Synnaeve, Adam Polyak, Uriel Singer, Alexandre D\u00e9fossez, Jade Copet, Devi Parikh, Yaniv Taigman, and Yossi Adi. 2022. Audiogen: Textually guided audio generation. arXiv preprint arXiv:2209.15352 (2022)."},{"key":"e_1_3_2_1_21_1","volume-title":"Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark D Plumbley. 2023. Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Audioldm 2: Learning holistic audio generation with self-supervised pretraining","author":"Liu Haohe","year":"2024","unstructured":"Haohe Liu, Yi Yuan, Xubo Liu, Xinhao Mei, Qiuqiang Kong, Qiao Tian, Yuping Wang, Wenwu Wang, Yuxuan Wang, and Mark D Plumbley. 2024. Audioldm 2: Learning holistic audio generation with self-supervised pretraining. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2024)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681688"},{"key":"e_1_3_2_1_24_1","unstructured":"Neuml. 2025. txtai: All-in-one open-source embeddings database for semantic search LLM orchestration and language model workflows. https:\/\/github.com\/neuml\/txtai. Accessed: 2025-05-22."},{"key":"e_1_3_2_1_25_1","unstructured":"Tim Nielsen Justin Drury and Kai Paquin. 2025. Universal Category System (UCS). https:\/\/universalcategorysystem.com\/. Accessed: 2025-05-22."},{"key":"e_1_3_2_1_26_1","volume-title":"AudioX: Diffusion Transformer for Anything-to-Audio Generation. arXiv preprint arXiv:2503.10522","author":"Tian Zeyue","year":"2025","unstructured":"Zeyue Tian, Yizhu Jin, Zhaoyang Liu, Ruibin Yuan, Xu Tan, Qifeng Chen, Wei Xue, and Yike Guo. 2025. AudioX: Diffusion Transformer for Anything-to-Audio Generation. arXiv preprint arXiv:2503.10522 (2025)."},{"key":"e_1_3_2_1_27_1","unstructured":"Andros Tjandra Yi-Chiao Wu Baishan Guo John Hoffman Brian Ellis Apoorv Vyas Bowen Shi Sanyuan Chen Matt Le Nick Zacharov et al. 2025. Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech Music and Sound. arXiv preprint arXiv:2502.05139 (2025)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_29_1","volume-title":"Audio-agent: Leveraging llms for audio generation, editing and composition. arXiv preprint arXiv:2410.03335","author":"Wang Zixuan","year":"2024","unstructured":"Zixuan Wang, Chi-Keung Tang, and Yu-Wing Tai. 2024. Audio-agent: Leveraging llms for audio generation, editing and composition. arXiv preprint arXiv:2410.03335 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_1_31_1","volume-title":"AudioTime: A Temporally-aligned Audio-text Benchmark Dataset. arXiv preprint arXiv:2407.02857","author":"Xie Zeyu","year":"2024","unstructured":"Zeyu Xie, Xuenan Xu, Zhizheng Wu, and Mengyue Wu. 2024a. AudioTime: A Temporally-aligned Audio-text Benchmark Dataset. arXiv preprint arXiv:2407.02857 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"Picoaudio: Enabling precise timestamp and frequency controllability of audio events in text-to-audio generation. arXiv preprint arXiv:2407.02869","author":"Xie Zeyu","year":"2024","unstructured":"Zeyu Xie, Xuenan Xu, Zhizheng Wu, and Mengyue Wu. 2024b. Picoaudio: Enabling precise timestamp and frequency controllability of audio events in text-to-audio generation. arXiv preprint arXiv:2407.02869 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Text-to-Audio Grounding: Building Correspondence Between Captions and Sound Events. In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 471-475","author":"Xu Xuenan","year":"2021","unstructured":"Xuenan Xu, Heinrich Dinkel, Mengyue Wu, and Kai Yu. 2021. Text-to-Audio Grounding: Building Correspondence Between Captions and Sound Events. In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 471-475."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning. 56704-56721","author":"Yang Ling","year":"2024","unstructured":"Ling Yang, Zhaochen Yu, Chenlin Meng, Minkai Xu, Stefano Ermon, and Bin Cui. 2024. Mastering Text-to-Image Diffusion: Recaptioning, Planning, and Generating with Multimodal LLMs. In Proceedings of the 41st International Conference on Machine Learning. 56704-56721."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758284","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:09:19Z","timestamp":1765343359000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758284"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":34,"alternative-id":["10.1145\/3746027.3758284","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758284","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}