{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:43:12Z","timestamp":1772905392742,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China","award":["U19B2043 61976185"],"award-info":[{"award-number":["U19B2043 61976185"]}]},{"name":"Fundamental Research Funds for the Central Universities","award":["226-2023-00048"],"award-info":[{"award-number":["226-2023-00048"]}]},{"name":"National Key Research & Development Project of China","award":["2021ZD0110700"],"award-info":[{"award-number":["2021ZD0110700"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611724","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"1485-1494","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":56,"title":["CATR: Combinatorial-Dependence Audio-Queried Transformer for Audio-Visual Video Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-4479-5920","authenticated-orcid":false,"given":"Kexin","family":"Li","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8783-8313","authenticated-orcid":false,"given":"Zongxin","family":"Yang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4912-3293","authenticated-orcid":false,"given":"Lei","family":"Chen","sequence":"additional","affiliation":[{"name":"Finvolution Group, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0512-880X","authenticated-orcid":false,"given":"Yi","family":"Yang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6142-9914","authenticated-orcid":false,"given":"Jun","family":"Xiao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"e_1_3_2_2_3_1","volume-title":"Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs","author":"Chen Liang-Chieh","year":"2017","unstructured":"Liang-Chieh Chen, George Papandreou, Iasonas Kokkinos, Kevin Murphy, and Alan L Yuille. 2017. Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE transactions on pattern analysis and machine intelligence 40, 4 (2017), 834--848."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"e_1_3_2_2_5_1","volume-title":"Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng, Alexander G. Schwing, and Alexander Kirillov. 2021. Per-Pixel Classification is Not All You Need for Semantic Segmentation. 
In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual, Marc'Aurelio Ranzato, Alina Beygelzimer, Yann N. Dauphin, Percy Liang, and Jennifer Wortman Vaughan (Eds.). 17864--17875. https:\/\/proceedings.neurips.cc\/paper\/2021\/ hash\/950a4152c2b4aa3ad78bdd6b366cc179-Abstract.html"},{"key":"e_1_3_2_2_6_1","volume-title":"Segment and track anything. arXiv preprint arXiv:2305.06558","author":"Cheng Yangming","year":"2023","unstructured":"Yangming Cheng, Liulei Li, Yuanyou Xu, Xiaodi Li, Zongxin Yang, Wenguan Wang, and Yi Yang. 2023. Segment and track anything. arXiv preprint arXiv:2305.06558 (2023)."},{"key":"e_1_3_2_2_7_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00406"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00585"},{"key":"e_1_3_2_2_10_1","volume-title":"Christopher KI Williams, John Winn, and Andrew Zisserman.","author":"Everingham Mark","year":"2010","unstructured":"Mark Everingham, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. 2010. The pascal visual object classes (voc) challenge. 
International journal of computer vision 88 (2010), 303--338."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander C Berg Wan-Yen Lo et al. 2023. Segment anything. arXiv preprint arXiv:2304.02643 (2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01794"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3289753"},{"key":"e_1_3_2_2_17_1","first-page":"31360","article-title":"Gmmseg: Gaussian mixture based generative semantic segmentation models","volume":"35","author":"Liang Chen","year":"2022","unstructured":"Chen Liang, Wenguan Wang, Jiaxu Miao, and Yi Yang. 2022. Gmmseg: Gaussian mixture based generative semantic segmentation models. In Advances in Neural Information Processing Systems, Vol. 35. 31360--31375.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3262578"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683226"},{"key":"e_1_3_2_2_22_1","first-page":"11449","article-title":"Exploring cross-video and cross-modality signals for weakly-supervised audio-visual video parsing","volume":"34","author":"Lin Yan-Bo","year":"2021","unstructured":"Yan-Bo Lin, Hung-Yu Tseng, Hsin-Ying Lee, Yen-Yu Lin, and Ming-Hsuan Yang. 2021. 
Exploring cross-video and cross-modality signals for weakly-supervised audio-visual video parsing. Advances in Neural Information Processing Systems 34 (2021), 11449--11461.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_23_1","volume-title":"Proceedings of the Asian Conference on Computer Vision.","author":"Lin Yan-Bo","year":"2020","unstructured":"Yan-Bo Lin and Yu-Chiang Frank Wang. 2020. Audiovisual transformer with instance attention for audio-visual event localization. In Proceedings of the Asian Conference on Computer Vision."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_39"},{"key":"e_1_3_2_2_26_1","volume-title":"Making a case for 3d convolutions for object segmentation in videos. arXiv preprint arXiv:2008.11516","author":"Mahadevan Sabarinath","year":"2020","unstructured":"Sabarinath Mahadevan, Ali Athar, Aljo\u0161a O\u0161ep, Sebastian Hennen, Laura Leal-Taix\u00e9, and Bastian Leibe. 2020. Making a case for 3d convolutions for object segmentation in videos. arXiv preprint arXiv:2008.11516 (2020)."},{"key":"e_1_3_2_2_27_1","volume-title":"Transformer transforms salient object detection and camouflaged object detection. arXiv preprint arXiv:2104.10127","author":"Mao Yuxin","year":"2021","unstructured":"Yuxin Mao, Jing Zhang, Zhexiong Wan, Yuchao Dai, Aixuan Li, Yunqiu Lv, Xinyu Tian, Deng-Ping Fan, and Nick Barnes. 2021. Transformer transforms salient object detection and camouflaged object detection. arXiv preprint arXiv:2104.10127 (2021)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688"},{"key":"e_1_3_2_2_29_1","volume-title":"V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV). 
Ieee, 565--571.","author":"Milletari Fausto","year":"2016","unstructured":"Fausto Milletari, Nassir Navab, and Seyed-Ahmad Ahmadi. 2016. V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV). Ieee, 565--571."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_18"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093616"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI"},{"key":"e_1_3_2_2_33_1","volume-title":"TransTrack: Multiple-Object Tracking with Transformer. CoRR abs\/2012.15460","author":"Sun Peize","year":"2020","unstructured":"Peize Sun, Yi Jiang, Rufeng Zhang, Enze Xie, Jinkun Cao, Xinting Hu, Tao Kong, Zehuan Yuan, Changhu Wang, and Ping Luo. 2020. TransTrack: Multiple-Object Tracking with Transformer. CoRR abs\/2012.15460 (2020). arXiv:2012.15460 https:\/\/arxiv.org\/abs\/2012.15460"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_26"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_2_2_36_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna M. Wallach, Rob Fergus, S. V. N. Vishwanathan, and Roman Garnett (Eds.). 5998--6008. 
https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/ 3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_2_37_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_38_1","volume-title":"A survey on deep learning technique for video segmentation. arXiv e-prints","author":"Wang Wenguan","year":"2021","unstructured":"Wenguan Wang, Tianfei Zhou, Fatih Porikli, David Crandall, and Luc Van Gool. 2021. A survey on deep learning technique for video segmentation. arXiv e-prints (2021), arXiv-2107."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00492"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00138"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00639"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413581"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5361"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1631\/FITEE.2100463"},{"key":"e_1_3_2_2_47_1","first-page":"2491","article-title":"Associating objects with transformers for video object segmentation","volume":"34","author":"Yang Zongxin","year":"2021","unstructured":"Zongxin Yang, Yunchao Wei, and Yi Yang. 2021. Associating objects with transformers for video object segmentation. 
Advances in Neural Information Processing Systems 34 (2021), 2491--2502.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_48_1","first-page":"4701","article-title":"Collaborative video object segmentation by multi-scale foreground-background integration","volume":"44","author":"Yang Zongxin","year":"2021","unstructured":"Zongxin Yang, Yunchao Wei, and Yi Yang 2021. Collaborative video object segmentation by multi-scale foreground-background integration. TPAMI 44, 9 (2021), 4701--4712.","journal-title":"TPAMI"},{"key":"e_1_3_2_2_49_1","unstructured":"Zongxin Yang and Yi Yang. 2022. Decoupling Features in Hierarchical Propagation for Video Object Segmentation. In NeurIPS."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547869"},{"key":"e_1_3_2_2_51_1","first-page":"15448","article-title":"Learning generative vi-sion transformer with energy-based latent space for saliency prediction","volume":"34","author":"Zhang Jing","year":"2021","unstructured":"Jing Zhang, Jianwen Xie, Nick Barnes, and Ping Li. 2021. Learning generative vi-sion transformer with energy-based latent space for saliency prediction. Advances in Neural Information Processing Systems 34 (2021), 15448--15463.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00223"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"e_1_3_2_2_54_1","unstructured":"Jinxing Zhou Xuyang Shen Jianyuan Wang Jiayi Zhang Weixuan Sun Jing Zhang Stan Birchfield Dan Guo Lingpeng Kong Meng Wang et al. 2023. Audio-Visual Segmentation with Semantics. 
arXiv preprint arXiv:2301.13190 (2023)."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_22"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00833"},{"key":"e_1_3_2_2_57_1","volume-title":"Deformable DETR: Deformable Transformers for End-to-End Object Detection. In 9th International Conference on Learning Representations, ICLR 2021","author":"Zhu Xizhou","year":"2021","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2021. Deformable DETR: Deformable Transformers for End-to-End Object Detection. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net. https:\/\/openreview.net\/forum? id=gZ9hCDWe6ke"},{"key":"e_1_3_2_2_58_1","volume-title":"Segment everything everywhere all at once. arXiv preprint arXiv:2304.06718","author":"Zou Xueyan","year":"2023","unstructured":"Xueyan Zou, Jianwei Yang, Hao Zhang, Feng Li, Linjie Li, Jianfeng Gao, and Yong Jae Lee. 2023. Segment everything everywhere all at once. 
arXiv preprint arXiv:2304.06718 (2023)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611724","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611724","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:10:29Z","timestamp":1755821429000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611724"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":58,"alternative-id":["10.1145\/3581783.3611724","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611724","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}