{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:26:42Z","timestamp":1765308402007,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","funder":[{"name":"CCF Baidu Open Fund","award":["CCF-Baidu202416"],"award-info":[{"award-number":["CCF-Baidu202416"]}]},{"name":"National Natural Science Foundation of China","award":["62472025"],"award-info":[{"award-number":["62472025"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755518","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:47:42Z","timestamp":1761371262000},"page":"553-562","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["OV-DAVEL: Towards Open-Vocabulary Dense Audio-Visual Event Localization in Untrimmed Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9335-0151","authenticated-orcid":false,"given":"Jiale","family":"Yu","sequence":"first","affiliation":[{"name":"Beijing Jiaotong University, Haidian District, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2592-2354","authenticated-orcid":false,"given":"Baopeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University, Haidian District, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1754-4878","authenticated-orcid":false,"given":"Zhu","family":"Teng","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University, Haidian District, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2290-1785","authenticated-orcid":false,"given":"Jianping","family":"Fan","sequence":"additional","affiliation":[{"name":"AI Lab at Lenovo Research, Haidian District, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"VATT: Transformers for Multimodal Self- Supervised Learning from Raw Video, Audio and Text. In Advances in Neural Information Processing Systems","author":"Akbari Hassan","year":"2021","unstructured":"Hassan Akbari, Liangzhe Yuan, Rui Qian, Wei-Hong Chuang, Shih-Fu Chang, Yin Cui, and Boqing Gong. 2021. VATT: Transformers for Multimodal Self- Supervised Learning from Raw Video, Audio and Text. In Advances in Neural Information Processing Systems, M. Ranzato, A. Beygelzimer, Y. Dauphin, P.S. Liang, and J. Wortman Vaughan (Eds.), Vol. 34. Curran Associates, Inc., 24206--24221. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2021\/file\/cb3213ada48302953cb0f166464ab356-Paper.pdf"},{"key":"e_1_3_2_1_2_1","volume-title":"Vision and Language. In Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"1312","author":"Baevski Alexei","year":"2022","unstructured":"Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, and Michael Auli. 2022. data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language. In Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 162), Kamalika Chaudhuri, Stefanie Jegelka, Le Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato (Eds.). PMLR, 1298--1312. https:\/\/proceedings.mlr.press\/v162\/baevski22a.html"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01719"},{"key":"e_1_3_2_1_4_1","volume-title":"Wortman Vaughan (Eds.)","volume":"34","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng, Alex Schwing, and Alexander Kirillov. 2021. Per-Pixel Classification is Not All You Need for Semantic Segmentation. In Advances in Neural Information Processing Systems, M. Ranzato, A. Beygelzimer, Y. Dauphin, P.S. Liang, and J. Wortman Vaughan (Eds.), Vol. 34. Curran Associates, Inc., 17864--17875. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2021\/file\/950a4152c2b4aa3ad78bdd6b366cc179-Paper.pdf"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2006.886263"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612506"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02197"},{"key":"e_1_3_2_1_9_1","unstructured":"Tiantian Geng TengWang Yanfu Zhang Jinming Duan Weili Guan Feng Zheng and Ling shao. 2024. UniAV: Unified Audio-Visual Perception for Multi-Task Video Event Localization. arXiv:2404.03179 [cs.CV] https:\/\/arxiv.org\/abs\/2404.03179"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_1_11_1","unstructured":"Xiuye Gu Tsung-Yi Lin Weicheng Kuo and Yin Cui. 2022. Openvocabulary Object Detection via Vision and Language Knowledge Distillation. arXiv:2104.13921 [cs.CV] https:\/\/arxiv.org\/abs\/2104.13921"},{"key":"e_1_3_2_1_12_1","volume-title":"Text and Audio. In ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 976--980","author":"Guzhov Andrey","year":"2022","unstructured":"Andrey Guzhov, Federico Raue, J\u00f6rn Hees, and Andreas Dengel. 2022. Audioclip: Extending Clip to Image, Text and Audio. In ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 976--980. doi:10. 1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision- Language Representation Learning With Noisy Text Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 4904--4916."},{"key":"e_1_3_2_1_15_1","volume-title":"andWeidi Xie","author":"Ju Chen","year":"2022","unstructured":"Chen Ju, Tengda Han, Kunhao Zheng, Ya Zhang, andWeidi Xie. 2022. Prompting Visual-Language Models for Efficient Video Understanding. In Computer Vision -- ECCV 2022, Shai Avidan, Gabriel Brostow, Moustapha Ciss\u00e9, Giovanni Maria Farinella, and Tal Hassner (Eds.). Springer Nature Switzerland, Cham, 105--124."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.113"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02516"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3395778"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683226"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-031--20062--5_39"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748--8763. https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, Patrick Schramowski, Srivatsa Kundurthy, Katherine Crowson, Ludwig Schmidt, Robert Kaczmarczyk, and Jenia Jitsev. 2022. LAION-5B: an open large-scale dataset for training next generation image-text models. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '22). Curran Associates Inc., Red Hook, NY, USA, Article 1833, 17 pages."},{"key":"e_1_3_2_1_24_1","unstructured":"Yunlong Tang Daiki Shimada Jing Bi Mingqian Feng Hang Hua and Chenliang Xu. 2024. Empowering LLMs with Pseudo-Untrimmed Videos for Audio-Visual Temporal Understanding. arXiv:2403.16276 [cs.CV] https:\/\/arxiv.org\/abs\/2403. 16276"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00622"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","unstructured":"Yusong Wu Ke Chen Tianyu Zhang Yuchen Hui Taylor Berg-Kirkpatrick and Shlomo Dubnov. 2023. Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation. In ICASSP 2023 - 2023 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). 1--5. doi:10.1109\/ICASSP49357.2023.10095969","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01936"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Ling Xing Hongyu Qu Rui Yan Xiangbo Shu and Jinhui Tang. 2024. Locality aware Cross-modal Correspondence Learning for Dense Audio-Visual Events Localization. arXiv:2409.07967 [cs.CV] https:\/\/arxiv.org\/abs\/2409.07967","DOI":"10.1109\/TCSVT.2025.3629609"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01340"},{"key":"e_1_3_2_1_34_1","unstructured":"Jinxing Zhou Dan Guo Ruohao Guo Yuxin Mao Jingjing Hu Yiran Zhong Xiaojun Chang and Meng Wang. 2024. Towards Open-Vocabulary Audio-Visual Event Localization. arXiv:2411.11278 [cs.CV] https:\/\/arxiv.org\/abs\/2411.11278"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00833"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681266"},{"key":"e_1_3_2_1_37_1","unstructured":"Ziheng Zhou Jinxing Zhou Wei Qian Shengeng Tang Xiaojun Chang and Dan Guo. 2024. Dense Audio-Visual Event Localization under Cross-Modal Consistency and Multi-Temporal Granularity Collaboration. arXiv:2412.12628 [cs.CV] https:\/\/arxiv.org\/abs\/2412.12628"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2023.3275873"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755518","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:23:01Z","timestamp":1765308181000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755518"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":38,"alternative-id":["10.1145\/3746027.3755518","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755518","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}