{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:29:36Z","timestamp":1764588576752,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"the National Science and Technology Major Project","award":["NO. 2023ZD0121201"],"award-info":[{"award-number":["NO. 2023ZD0121201"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102416, 62102419"],"award-info":[{"award-number":["62102416, 62102419"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the Key Research and Development Program of Jiangsu Province","award":["BE2023016-3"],"award-info":[{"award-number":["BE2023016-3"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680889","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"10853-10861","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["MM-LDM: Multi-Modal Latent Diffusion Model for Sounding Video Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3010-9432","authenticated-orcid":false,"given":"Mingzhen","family":"Sun","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Science, School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7299-6431","authenticated-orcid":false,"given":"Weining","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Science, School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5606-0702","authenticated-orcid":false,"given":"Yanyuan","family":"Qiao","sequence":"additional","affiliation":[{"name":"University of Adelaide, Adelaide, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0607-8847","authenticated-orcid":false,"given":"Jiahui","family":"Sun","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Science, School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4451-5591","authenticated-orcid":false,"given":"Zihan","family":"Qin","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Science, School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4340-4000","authenticated-orcid":false,"given":"Longteng","family":"Guo","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Science, School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2142-5580","authenticated-orcid":false,"given":"Xinxin","family":"Zhu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Science, School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0903-9131","authenticated-orcid":false,"given":"Jing","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Science, School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58583-9_42"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_7"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_6_1","volume-title":"Latent video diffusion models for high-fidelity video generation with arbitrary lengths. arXiv preprint arXiv:2211.13221","author":"He Yingqing","year":"2022","unstructured":"Yingqing He, Tianyu Yang, Yong Zhang, Ying Shan, and Qifeng Chen. 2022. Latent video diffusion models for high-fidelity video generation with arbitrary lengths. arXiv preprint arXiv:2211.13221 (2022)."},{"key":"e_1_3_2_1_7_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems, Vol. 33 (2020), 6840--6851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_8_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_9_1","volume-title":"Make-An-Audio: Text-To-Audio Generation with Prompt-Enhanced Diffusion Models. In International Conference on Machine Learning. 13916--13932","author":"Huang Rongjie","year":"2023","unstructured":"Rongjie Huang, Jiawei Huang, Dongchao Yang, Yi Ren, Luping Liu, Mingze Li, Zhenhui Ye, Jinglin Liu, Xiang Yin, and Zhou Zhao. 2023. Make-An-Audio: Text-To-Audio Generation with Prompt-Enhanced Diffusion Models. In International Conference on Machine Learning. 13916--13932."},{"key":"e_1_3_2_1_10_1","first-page":"21696","article-title":"Variational diffusion models","volume":"34","author":"Kingma Diederik","year":"2021","unstructured":"Diederik Kingma, Tim Salimans, Ben Poole, and Jonathan Ho. 2021. Variational diffusion models. Advances in Neural Information Processing Systems, Vol. 34 (2021), 21696--21707.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_11_1","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in Neural Information Processing Systems, Vol. 33 (2020), 17022--17033.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","volume-title":"DiffWave: A Versatile Diffusion Model for Audio Synthesis. In International Conference on Learning Representations.","author":"Kong Zhifeng","year":"2021","unstructured":"Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. 2021. DiffWave: A Versatile Diffusion Model for Audio Synthesis. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413802"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_3"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"volume-title":"AudioLDM: Text-to-Audio Generation with Latent Diffusion Models. In International Conference on Machine Learning. 21450--21474","author":"Liu Haohe","key":"e_1_3_2_1_16_1","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo P. Mandic, Wenwu Wang, and Mark D. Plumbley. 2023. AudioLDM: Text-to-Audio Generation with Latent Diffusion Models. In International Conference on Machine Learning. 21450--21474."},{"key":"e_1_3_2_1_17_1","volume-title":"Sounding Video Generator: A Unified Framework for Text-guided Sounding Video Generation","author":"Liu Jiawei","year":"2023","unstructured":"Jiawei Liu, Weining Wang, Sihan Chen, Xinxin Zhu, and Jing Liu. 2023. Sounding Video Generator: A Unified Framework for Text-guided Sounding Video Generation. IEEE Transactions on Multimedia (2023), 1--13."},{"key":"e_1_3_2_1_18_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Luo Simian","year":"2024","unstructured":"Simian Luo, Chuanhao Yan, Chenxu Hu, and Hang Zhao. 2024. Diff-foley: Synchronized video-to-audio synthesis with latent diffusion models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Machine Learning. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. 8748--8763."},{"key":"e_1_3_2_1_21_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"e_1_3_2_1_24_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 36479--36494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_25_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Sun Mingzhen","year":"2024","unstructured":"Mingzhen Sun, Weining Wang, Zihan Qin, Jiahui Sun, Sihan Chen, and Jing Liu. 2024. GLOBER: Coherent Non-autoregressive Video Generation via GLOBal Guided Video DecodER. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01796"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3413599"},{"key":"e_1_3_2_1_28_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in Neural Information Processing Systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Learning Representations.","author":"Villegas Ruben","year":"2023","unstructured":"Ruben Villegas, Mohammad Babaeizadeh, Pieter-Jan Kindermans, Hernan Moraldo, Han Zhang, Mohammad Taghi Saffar, Santiago Castro, Julius Kunze, and Dumitru Erhan. 2023. Phenaki: Variable Length Video Generation from Open Domain Textual Descriptions. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i14.29475"},{"key":"e_1_3_2_1_31_1","volume-title":"Diffusion Feedback Helps CLIP See Better. arXiv preprint arXiv:2407.20171","author":"Wang Wenxuan","year":"2024","unstructured":"Wenxuan Wang, Quan Sun, Fan Zhang, Yepeng Tang, Jing Liu, and Xinlong Wang. 2024. Diffusion Feedback Helps CLIP See Better. arXiv preprint arXiv:2407.20171 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"CMMD: Contrastive Multi-Modal Diffusion for Video-Audio Conditional Modeling. CoRR","author":"Yang Ruihan","year":"2023","unstructured":"Ruihan Yang, Hannes Gamper, and Sebastian Braun. 2023. CMMD: Contrastive Multi-Modal Diffusion for Video-Audio Conditional Modeling. CoRR, Vol. abs\/2312.05412 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28486"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01770"},{"key":"e_1_3_2_1_35_1","volume-title":"Generating Videos with Dynamics-aware Implicit Generative Adversarial Networks. In International Conference on Learning Representations.","author":"Yu Sihyun","year":"2022","unstructured":"Sihyun Yu, Jihoon Tack, Sangwoo Mo, Hyunsu Kim, Junho Kim, Jung-Woo Ha, and Jinwoo Shin. 2022. Generating Videos with Dynamics-aware Implicit Generative Adversarial Networks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01242"},{"key":"e_1_3_2_1_37_1","volume-title":"Discrete contrastive diffusion for cross-modal and conditional generation. arXiv preprint arXiv:2206.07771","author":"Zhu Ye","year":"2022","unstructured":"Ye Zhu, Yu Wu, Kyle Olszewski, Jian Ren, Sergey Tulyakov, and Yan Yan. 2022. Discrete contrastive diffusion for cross-modal and conditional generation. arXiv preprint arXiv:2206.07771 (2022)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680889","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680889","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:33Z","timestamp":1750295853000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680889"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":37,"alternative-id":["10.1145\/3664647.3680889","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680889","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}