{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T16:45:57Z","timestamp":1777567557328,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680612","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"6607-6616","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["TAVGBench: Benchmarking Text to Audible-Video Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9239-091X","authenticated-orcid":false,"given":"Yuxin","family":"Mao","sequence":"first","affiliation":[{"name":"Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1968-7055","authenticated-orcid":false,"given":"Xuyang","family":"Shen","sequence":"additional","affiliation":[{"name":"OpenNLPLab, Shanghai AI Lab, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8516-0913","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Australian National University, Canberra, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9856-0843","authenticated-orcid":false,"given":"Zhen","family":"Qin","sequence":"additional","affiliation":[{"name":"Tap Tap, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6402-7593","authenticated-orcid":false,"given":"Jinxing","family":"Zhou","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7332-8324","authenticated-orcid":false,"given":"Mochu","family":"Xiang","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1404-3610","authenticated-orcid":false,"given":"Yiran","family":"Zhong","sequence":"additional","affiliation":[{"name":"OpenNLPLab, Shanghai AI Lab, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4432-7406","authenticated-orcid":false,"given":"Yuchao","family":"Dai","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_4_1","volume-title":"Valor: Vision-audio-language omni-perception pretraining model and dataset. arXiv preprint arXiv:2304.08345","author":"Chen Sihan","year":"2023","unstructured":"Sihan Chen, Xingjian He, Longteng Guo, Xinxin Zhu, Weining Wang, Jinhui Tang, and Jing Liu. 2023. Valor: Vision-audio-language omni-perception pretraining model and dataset. arXiv preprint arXiv:2304.08345 (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00240"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_8_1","volume-title":"Text-to-audio generation using instruction-tuned llm and latent diffusion model. arXiv preprint arXiv:2304.13731","author":"Ghosal Deepanway","year":"2023","unstructured":"Deepanway Ghosal, Navonil Majumder, Ambuj Mehrish, and Soujanya Poria. 2023. Text-to-audio generation using instruction-tuned llm and latent diffusion model. arXiv preprint arXiv:2304.13731 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Zhengyang Liang, Yaohui Wang, Yu Qiao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2023. AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27978"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_13_1","volume-title":"CLIPScore: A Reference-free Evaluation Metric for Image Captioning. In Conference on Empirical Methods in Natural Language Processing (EMNLP). 7514--7528","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. CLIPScore: A Reference-free Evaluation Metric for Image Captioning. In Conference on Empirical Methods in Natural Language Processing (EMNLP). 7514--7528."},{"key":"e_1_3_2_1_14_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik P Kingma Ben Poole Mohammad Norouzi David J Fleet et al. 2022. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_1_15_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Conference on Neural Information Processing Systems (NeurIPS) (2020), 6840--6851."},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Hong Wenyi","year":"2022","unstructured":"Wenyi Hong, Ming Ding, Wendi Zheng, Xinghan Liu, and Jie Tang. 2022. CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_17_1","volume-title":"Taming Visually Guided Sound Generation. In British Machine Vision Conference (BMVC). BMVA Press.","author":"Iashin Vladimir","year":"2021","unstructured":"Vladimir Iashin and Esa Rahtu. 2021. Taming Visually Guided Sound Generation. In British Machine Vision Conference (BMVC). BMVA Press."},{"key":"e_1_3_2_1_18_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"e_1_3_2_1_20_1","unstructured":"Chris Dongjoo Kim Byeongchang Kim Hyunmin Lee and Gunhee Kim. 2019. AudioCaps: Generating Captions for Audios in The Wild. In NAACL-HLT."},{"key":"e_1_3_2_1_21_1","volume-title":"AudioGen: Textually Guided Audio Generation. In International Conference on Learning Representations (ICLR).","author":"Kreuk Felix","year":"2022","unstructured":"Felix Kreuk, Gabriel Synnaeve, Adam Polyak, Uriel Singer, Alexandre D\u00e9fossez, Jade Copet, Devi Parikh, Yaniv Taigman, and Yossi Adi. 2022. AudioGen: Textually Guided Audio Generation. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_22_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","volume":"32","author":"Kumar Kundan","year":"2019","unstructured":"Kundan Kumar, Rithesh Kumar, Thibault De Boissiere, Lucas Gestin, Wei Zhen Teoh, Jose Sotelo, Alexandre De Brebisson, Yoshua Bengio, and Aaron C Courville. 2019. Melgan: Generative adversarial networks for conditional waveform synthesis. Conference on Neural Information Processing Systems (NeurIPS), Vol. 32 (2019)."},{"key":"e_1_3_2_1_23_1","volume-title":"Soundini: Sound-guided diffusion for natural video editing. arXiv preprint arXiv:2304.06818","author":"Lee Seung Hyun","year":"2023","unstructured":"Seung Hyun Lee, Sieun Kim, Innfarn Yoo, Feng Yang, Donghyeon Cho, Youngseo Kim, Huiwen Chang, Jinkyu Kim, and Sangpil Kim. 2023. Soundini: Sound-guided diffusion for natural video editing. arXiv preprint arXiv:2304.06818 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Machine Learning (ICML). PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International Conference on Machine Learning (ICML). PMLR, 19730--19742."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12233"},{"key":"e_1_3_2_1_26_1","volume-title":"AudioLDM: Text-to-Audio Generation with Latent Diffusion Models. In International Conference on Learning Representations (ICLR). PMLR, 21450--21474","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark D Plumbley. 2023. AudioLDM: Text-to-Audio Generation with Latent Diffusion Models. In International Conference on Learning Representations (ICLR). PMLR, 21450--21474."},{"key":"e_1_3_2_1_27_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","volume":"36","author":"Luo Simian","year":"2024","unstructured":"Simian Luo, Chuanhao Yan, Chenxu Hu, and Hang Zhao. 2024. Diff-foley: Synchronized video-to-audio synthesis with latent diffusion models. Conference on Neural Information Processing Systems (NeurIPS), Vol. 36 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Contrastive conditional latent diffusion for audio-visual segmentation. arXiv preprint arXiv:2307.16579","author":"Mao Yuxin","year":"2023","unstructured":"Yuxin Mao, Jing Zhang, Mochu Xiang, Yunqiu Lv, Yiran Zhong, and Yuchao Dai. 2023. Contrastive conditional latent diffusion for audio-visual segmentation. arXiv preprint arXiv:2307.16579 (2023)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00094"},{"key":"e_1_3_2_1_30_1","volume-title":"Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395","author":"Mei Xinhao","year":"2023","unstructured":"Xinhao Mei, Chutong Meng, Haohe Liu, Qiuqiang Kong, Tom Ko, Chengqi Zhao, Mark D Plumbley, Yuexian Zou, and Wenwu Wang. 2023. Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Drumgan: Synthesis of drum sounds with timbral feature conditioning using generative adversarial networks. arXiv preprint arXiv:2008.12073","author":"Nistal Javier","year":"2020","unstructured":"Javier Nistal, Stefan Lattner, and Gael Richard. 2020. Drumgan: Synthesis of drum sounds with timbral feature conditioning using generative adversarial networks. arXiv preprint arXiv:2008.12073 (2020)."},{"key":"e_1_3_2_1_32_1","unstructured":"OpenAI. 2022. ChatGPT: OpenAI's Conversational AI. https:\/\/openai.com\/chatgpt."},{"key":"e_1_3_2_1_33_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","volume":"32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Conference on Neural Information Processing Systems (NeurIPS), Vol. 32 (2019)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference on Machine Learning (ICML). PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML). PMLR, 8748--8763."},{"key":"e_1_3_2_1_36_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research (JMLR), Vol. 21, 140 (2020), 1--67.","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2022.102237"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01020"},{"key":"e_1_3_2_1_42_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et al. 2022. Make-A-Video: Text-to-Video Generation without Text-Video Data. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_43_1","volume-title":"International Conference on Machine Learning (ICML). 2256--2265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric A. Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep Unsupervised Learning using Nonequilibrium Thermodynamics. In International Conference on Machine Learning (ICML). 2256--2265."},{"key":"e_1_3_2_1_44_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations (ICLR).","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising Diffusion Implicit Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109399"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00622"},{"key":"e_1_3_2_1_48_1","volume-title":"Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly.","author":"Unterthiner Thomas","year":"2018","unstructured":"Thomas Unterthiner, Sjoerd Van Steenkiste, Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly. 2018. Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:1812.01717 (2018)."},{"key":"e_1_3_2_1_49_1","volume-title":"FVD: A new metric for video generation.","author":"Unterthiner Thomas","year":"2019","unstructured":"Thomas Unterthiner, Sjoerd van Steenkiste, Karol Kurach, Rapha\u00ebl Marinier, Marcin Michalski, and Sylvain Gelly. 2019. FVD: A new metric for video generation. (2019)."},{"key":"e_1_3_2_1_50_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Conference on Neural Information Processing Systems (NeurIPS), Vol. 30 (2017)."},{"key":"e_1_3_2_1_51_1","volume-title":"Lavie: High-quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103","author":"Wang Yaohui","year":"2023","unstructured":"Yaohui Wang, Xinyuan Chen, Xin Ma, Shangchen Zhou, Ziqi Huang, Yi Wang, Ceyuan Yang, Yinan He, Jiashuo Yu, Peiqing Yang, et al. 2023. Lavie: High-quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"Godiva: Generating open-domain videos from natural descriptions. arXiv preprint arXiv:2104.14806","author":"Wu Chenfei","year":"2021","unstructured":"Chenfei Wu, Lun Huang, Qianxi Zhang, Binyang Li, Lei Ji, Fan Yang, Guillermo Sapiro, and Nan Duan. 2021. Godiva: Generating open-domain videos from natural descriptions. arXiv preprint arXiv:2104.14806 (2021)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"e_1_3_2_1_54_1","volume-title":"Seeing and Hearing: Open-domain Visual-Audio Generation with Diffusion Latent Aligners. arXiv preprint arXiv:2402.17723","author":"Xing Yazhou","year":"2024","unstructured":"Yazhou Xing, Yingqing He, Zeyue Tian, Xintao Wang, and Qifeng Chen. 2024. Seeing and Hearing: Open-domain Visual-Audio Generation with Diffusion Latent Aligners. arXiv preprint arXiv:2402.17723 (2024)."},{"key":"e_1_3_2_1_55_1","volume-title":"Simda: Simple diffusion adapter for efficient video generation. arXiv preprint arXiv:2308.09710","author":"Xing Zhen","year":"2023","unstructured":"Zhen Xing, Qi Dai, Han Hu, Zuxuan Wu, and Yu-Gang Jiang. 2023. Simda: Simple diffusion adapter for efficient video generation. arXiv preprint arXiv:2308.09710 (2023)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_57_1","volume-title":"Diffsound: Discrete diffusion model for text-to-sound generation","author":"Yang Dongchao","year":"2023","unstructured":"Dongchao Yang, Jianwei Yu, Helin Wang, Wen Wang, Chao Weng, Yuexian Zou, and Dong Yu. 2023. Diffsound: Discrete diffusion model for text-to-sound generation. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (IEEE-ACM T AUDIO SPE) (2023)."},{"key":"e_1_3_2_1_58_1","unstructured":"Guy Yariv Itai Gat Sagie Benaim Lior Wolf Idan Schwartz and Yossi Adi. 2023. Diverse and Aligned Audio-to-Video Generation via Text-to-Video Model Adaptation. arxiv: 2309.16429"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01008"},{"key":"e_1_3_2_1_60_1","volume-title":"Jia-Wei Liu, Rui Zhao, Lingmin Ran, Yuchao Gu, Difei Gao, and Mike Zheng Shou.","author":"Zhang David Junhao","year":"2023","unstructured":"David Junhao Zhang, Jay Zhangjie Wu, Jia-Wei Liu, Rui Zhao, Lingmin Ran, Yuchao Gu, Difei Gao, and Mike Zheng Shou. 2023. Show-1: Marrying pixel and latent diffusion models for text-to-video generation. arXiv preprint arXiv:2309.15818 (2023)."},{"key":"e_1_3_2_1_61_1","volume-title":"Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. In Conference on Empirical Methods in Natural Language Processing (EMNLP). 543--553","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. In Conference on Empirical Methods in Natural Language Processing (EMNLP). 543--553."},{"key":"e_1_3_2_1_62_1","volume-title":"Label-anticipated Event Disentanglement for Audio-Visual Video Parsing. In European Conference on Computer Vision (ECCV). 1--22","author":"Zhou Jinxing","year":"2024","unstructured":"Jinxing Zhou, Dan Guo, Yuxin Mao, Yiran Zhong, Xiaojun Chang, and Meng Wang. 2024. Label-anticipated Event Disentanglement for Audio-Visual Video Parsing. In European Conference on Computer Vision (ECCV). 1--22."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3223688"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02142-3"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"crossref","unstructured":"Jinxing Zhou Xuyang Shen Jianyuan Wang Jiayi Zhang Weixuan Sun Jing Zhang Stan Birchfield Dan Guo Lingpeng Kong Meng Wang et al. 2023. Audio-visual segmentation with semantics. arXiv preprint arXiv:2301.13190 (2023).","DOI":"10.1007\/s11263-024-02261-x"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_22"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612707"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680612","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680612","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:56Z","timestamp":1750295876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680612"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":67,"alternative-id":["10.1145\/3664647.3680612","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680612","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}