{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:21:08Z","timestamp":1777656068658,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62371009 &61971008"],"award-info":[{"award-number":["62371009 &61971008"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681586","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"7533-7541","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Open-Vocabulary Audio-Visual Semantic Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1091-272X","authenticated-orcid":false,"given":"Ruohao","family":"Guo","sequence":"first","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5228-0348","authenticated-orcid":false,"given":"Liao","family":"Qu","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7421-5858","authenticated-orcid":false,"given":"Dantong","family":"Niu","sequence":"additional","affiliation":[{"name":"Berkeley AI Research, University of California, Berkeley, Berkeley, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9931-7855","authenticated-orcid":false,"given":"Yanyu","family":"Qi","sequence":"additional","affiliation":[{"name":"College of Information and Electrical Engineering, China Agricultural University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5165-0869","authenticated-orcid":false,"given":"Wenzhen","family":"Yue","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4584-4569","authenticated-orcid":false,"given":"Ji","family":"Shi","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3254-3902","authenticated-orcid":false,"given":"Bowei","family":"Xing","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9785-0727","authenticated-orcid":false,"given":"Xianghua","family":"Ying","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_2_1_3_1","first-page":"17864","article-title":"Per-pixel classification is not all you need for semantic segmentation","volume":"34","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng, Alex Schwing, and Alexander Kirillov. 2021. Per-pixel classification is not all you need for semantic segmentation. Advances in Neural Information Processing Systems 34 (2021), 17864--17875.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2021. An image is worth 16x16 words: Transformers for image recognition at scale. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i11.29104"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Gu Xiuye","year":"2022","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2022. Open-vocabulary object detection via vision and language knowledge distillation. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_8_1","volume-title":"OpenVIS: Open-vocabulary video instance segmentation. arXiv preprint arXiv:2305.16835","author":"Guo Pinxue","year":"2023","unstructured":"Pinxue Guo, Tony Huang, Peiyang He, Xuefeng Liu, Tianjun Xiao, Zhaoyu Chen, and Wenqiang Zhang. 2023. OpenVIS: Open-vocabulary video instance segmentation. arXiv preprint arXiv:2305.16835 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00707"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the International Conference on Machine Learning. 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In Proceedings of the International Conference on Machine Learning. 2790--2799."},{"key":"e_1_3_2_1_13_1","volume-title":"Segment Anything. In Proceedings of the IEEE International Conference on Computer Vision. 3992--4003","author":"Kirillov Alexander","unstructured":"Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Doll\u00e1r, and Ross B. Girshick. 2023. Segment Anything. In Proceedings of the IEEE International Conference on Computer Vision. 3992--4003."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Li Boyi","year":"2022","unstructured":"Boyi Li, Kilian Q Weinberger, Serge Belongie, Vladlen Koltun, and Ren\u00e9 Ranftl. 2022. Language-driven semantic segmentation. Proceedings of the International Conference on Learning Representations (2022)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611724"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00311"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612373"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the Conference on European Conference on Computer Vision.","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chun yue Li, Jianwei Yang, Hang Su, Jun-Juan Zhu, and Lei Zhang. 2024. Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection. In Proceedings of the Conference on European Conference on Computer Vision."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. DecoupledWeight Decay Regularization. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00094"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 19446--19455","author":"Qin Jie","year":"2023","unstructured":"Jie Qin, Jie Wu, Pengxiang Yan, Ming Li, Ren Yuxi, Xuefeng Xiao, Yitong Wang, Rui Wang, Shilei Wen, Xin Pan, et al. 2023. Freeseg: Unified, universal and open-vocabulary image segmentation. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 19446--19455."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the International Conference on Machine Learning. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the International Conference on Machine Learning. 8748--8763."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00375"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28378"},{"key":"e_1_3_2_1_26_1","unstructured":"Jianzong Wu Xiangtai Li Shilin Xu Haobo Yuan Henghui Ding Yibo Yang Xia Li Jiangning Zhang Yunhai Tong Xudong Jiang et al. 2024. Towards open vocabulary learning: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing. 1--5.","author":"Wu Yusong","year":"2022","unstructured":"Yusong Wu, K. Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, and Shlomo Dubnov. 2022. Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation. In Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing. 1--5."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"e_1_3_2_1_29_1","unstructured":"Mengde Xu Zheng Zhang Fangyun Wei Yutong Lin Yue Cao Han Hu Xiang Bai et al. 2021. A simple baseline for zero-shot semantic segmentation with pre-trained vision-language model. arXiv preprint arXiv:2112.14757 3 (2021) 2."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the British Machine Vision Conference.","author":"Yu Jiarui","year":"2023","unstructured":"Jiarui Yu, Haoran Li, Yanbin Hao, Jinmeng Wu, Tong Xu, Shuo Wang, and Xiangnan He. 2023. How Can Contrastive Pre-training Benefit Audio-Visual Segmentation? A Study from Supervised and Zero-shot Perspectives. In Proceedings of the British Machine Vision Conference."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Jinxing Zhou Xuyang Shen Jianyuan Wang Jiayi Zhang Weixuan Sun Jing Zhang Stan Birchfield Dan Guo Lingpeng Kong MengWang et al. 2023. Audiovisual segmentation with semantics. arXiv preprint arXiv:2301.13190 (2023).","DOI":"10.1007\/s11263-024-02261-x"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_22"},{"key":"e_1_3_2_1_35_1","volume-title":"CLIPVIS: Adapting CLIP for Open-Vocabulary Video Instance Segmentation. arXiv preprint arXiv:2403.12455","author":"Zhu Wenqi","year":"2024","unstructured":"Wenqi Zhu, Jiale Cao, Jin Xie, Shuangming Yang, and Yanwei Pang. 2024. CLIPVIS: Adapting CLIP for Open-Vocabulary Video Instance Segmentation. arXiv preprint arXiv:2403.12455 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. Deformable detr: Deformable transformers for end-to-end object detection. In Proceedings of the International Conference on Learning Representations."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681586","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681586","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681586"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":36,"alternative-id":["10.1145\/3664647.3681586","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681586","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}