{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T19:08:00Z","timestamp":1768676880895,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"ARC-DECRA","award":["DE230100477"],"award-info":[{"award-number":["DE230100477"]}]},{"name":"ARC-Discovery","award":["DP220100800"],"award-info":[{"award-number":["DP220100800"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612373","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"7590-7598","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":32,"title":["Audio-Visual Segmentation by Exploring Cross-Modal Mutual Semantics"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3159-0034","authenticated-orcid":false,"given":"Chen","family":"Liu","sequence":"first","affiliation":[{"name":"The University of Queensland &amp; University of Technology Sydney, Brisbane, QLD, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1809-2137","authenticated-orcid":false,"given":"Peike Patrick","family":"Li","sequence":"additional","affiliation":[{"name":"Matrix Verse, Sydney, NSW, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9772-5707","authenticated-orcid":false,"given":"Xingqun","family":"Qi","sequence":"additional","affiliation":[{"name":"Netease Fuxi AI Lab, Zhejiang, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9892-9515","authenticated-orcid":false,"given":"Hu","family":"Zhang","sequence":"additional","affiliation":[{"name":"The University of Queensland, Brisbane, QLD, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6047-0472","authenticated-orcid":false,"given":"Lincheng","family":"Li","sequence":"additional","affiliation":[{"name":"Netease Fuxi AI Lab, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0409-2259","authenticated-orcid":false,"given":"Dadong","family":"Wang","sequence":"additional","affiliation":[{"name":"CSIRO DATA61, Sydney, NSW, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0269-5649","authenticated-orcid":false,"given":"Xin","family":"Yu","sequence":"additional","affiliation":[{"name":"The University of Queensland, Brisbane, QLD, Australia"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"End-to-end object detection with transformers","author":"Carion Nicolas","unstructured":"Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. 2020. End-to-end object detection with transformers. In ECCV. Springer, 213--229."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"e_1_3_2_1_3_1","volume-title":"Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs","author":"Chen Liang-Chieh","year":"2017","unstructured":"Liang-Chieh Chen, George Papandreou, Iasonas Kokkinos, Kevin Murphy, and Alan L Yuille. 2017a. Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE transactions on pattern analysis and machine intelligence, Vol. 40, 4 (2017), 834--848."},{"key":"e_1_3_2_1_4_1","volume-title":"Rethinking atrous convolution for semantic image segmentation. 
arXiv preprint arXiv:1706.05587","author":"Chen Liang-Chieh","year":"2017","unstructured":"Liang-Chieh Chen, George Papandreou, Florian Schroff, and Hartwig Adam. 2017b. Rethinking atrous convolution for semantic image segmentation. arXiv preprint arXiv:1706.05587 (2017)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"e_1_3_2_1_6_1","volume-title":"BEATs: Audio Pre-Training with Acoustic Tokenizers. arXiv preprint arXiv:2212.09058","author":"Chen Sanyuan","year":"2022","unstructured":"Sanyuan Chen, Yu Wu, Chengyi Wang, Shujie Liu, Daniel Tompkins, Zhuo Chen, and Furu Wei. 2022. BEATs: Audio Pre-Training with Acoustic Tokenizers. arXiv preprint arXiv:2212.09058 (2022)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_2_1_8_1","first-page":"17864","article-title":"Per-pixel classification is not all you need for semantic segmentation","volume":"34","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng, Alex Schwing, and Alexander Kirillov. 2021. Per-pixel classification is not all you need for semantic segmentation. Advances in Neural Information Processing Systems, Vol. 34 (2021), 17864--17875.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","volume-title":"Christopher KI Williams, John Winn, and Andrew Zisserman.","author":"Everingham Mark","year":"2010","unstructured":"Mark Everingham, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. 2010. The pascal visual object classes (voc) challenge. International journal of computer vision, Vol. 88 (2010), 303--338."},{"key":"e_1_3_2_1_10_1","volume-title":"Sne-roadseg: Incorporating surface normal information into semantic segmentation for accurate freespace detection","author":"Fan Rui","year":"2020","unstructured":"Rui Fan, Hengli Wang, Peide Cai, and Ming Liu. 2020. 
Sne-roadseg: Incorporating surface normal information into semantic segmentation for accurate freespace detection. In ECCV. Springer, 340--356."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLA.2019.00066"},{"key":"e_1_3_2_1_14_1","first-page":"10077","article-title":"Discriminative sounding objects localization via self-supervised audiovisual matching","volume":"33","author":"Hu Di","year":"2020","unstructured":"Di Hu, Rui Qian, Minyue Jiang, Xiao Tan, Shilei Wen, Errui Ding, Weiyao Lin, and Dejing Dou. 2020. Discriminative sounding objects localization via self-supervised audiovisual matching. Advances in Neural Information Processing Systems, Vol. 33 (2020), 10077--10087.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00963"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In ECCV. 740--755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548317"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20073"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"e_1_3_2_1_22_1","volume-title":"V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV). 
Ieee, 565--571.","author":"Milletari Fausto","year":"2016","unstructured":"Fausto Milletari, Nassir Navab, and Seyed-Ahmad Ahmadi. 2016. V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV). Ieee, 565--571."},{"key":"e_1_3_2_1_23_1","volume-title":"A Closer Look at Weakly-Supervised Audio-Visual Source Localization. arXiv preprint arXiv:2209.09634","author":"Mo Shentong","year":"2022","unstructured":"Shentong Mo and Pedro Morgado. 2022a. A Closer Look at Weakly-Supervised Audio-Visual Source Localization. arXiv preprint arXiv:2209.09634 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Localizing visual sounds the easy way","author":"Mo Shentong","unstructured":"Shentong Mo and Pedro Morgado. 2022b. Localizing visual sounds the easy way. In ECCV. Springer, 218--234."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the Asian Conference on Computer Vision.","author":"Oya Takashi","year":"2020","unstructured":"Takashi Oya, Shohei Iwase, Ryota Natsume, Takahiro Itazuri, Shugo Yamaguchi, and Shigeo Morishima. 2020. Do we need sound for sound source localization?. In Proceedings of the Asian Conference on Computer Vision."},{"key":"e_1_3_2_1_26_1","unstructured":"Xingqun Qi Chen Liu Lincheng Li Jie Hou Haoran Xin and Xin Yu. 2023 a. EmotionGesture: Audio-Driven Diverse Emotional Co-Speech 3D Gesture Generation. arxiv: 2305.18891 [cs.CV]"},{"key":"e_1_3_2_1_27_1","volume-title":"2023 b. Diverse 3D Hand Gesture Prediction from Body Dynamics by Bilateral Hand Disentanglement. arXiv preprint arXiv:2303.01765","author":"Qi Xingqun","year":"2023","unstructured":"Xingqun Qi, Chen Liu, Muyi Sun, Lincheng Li, Changjie Fan, and Xin Yu. 2023 b. Diverse 3D Hand Gesture Prediction from Body Dynamics by Bilateral Hand Disentanglement. 
arXiv preprint arXiv:2303.01765 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Rui Qian Di Hu Heinrich Dinkel Mengyue Wu Ning Xu and Weiyao Lin. 2020. Multiple sound sources localization from coarse to fine. In ECCV. 292--308.","DOI":"10.1007\/978-3-030-58565-5_18"},{"key":"e_1_3_2_1_29_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation. In Medical Image Computing and Computer-Assisted Intervention-MICCAI 2015: 18th International Conference","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In Medical Image Computing and Computer-Assisted Intervention-MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18. Springer, 234--241."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.3390\/jzbg2020011"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747867"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00065"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00222"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475555"},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3222--3231","author":"Song Zengjie","year":"2022","unstructured":"Zengjie Song, Yuxi Wang, Junsong Fan, Tieniu Tan, and Zhaoxiang Zhang. 2022. Self-supervised predictive learning: A negative-free method for sound source localization in visual scenes. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 
3222--3231."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"e_1_3_2_1_38_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Jinghua Wang Zhenhua Wang Dacheng Tao Simon See and Gang Wang. 2016. Learning common and specific features for RGB-D semantic segmentation with deconvolutional networks. In ECCV. 664--679.","DOI":"10.1007\/978-3-319-46454-1_40"},{"key":"e_1_3_2_1_40_1","volume-title":"Catch me if you hear me: Audio-visual navigation in complex unmapped environments with moving sounds","author":"Younes Abdelrahman","year":"2023","unstructured":"Abdelrahman Younes, Daniel Honerkamp, Tim Welschehold, and Abhinav Valada. 2023. Catch me if you hear me: Audio-visual navigation in complex unmapped environments with moving sounds. IEEE Robotics and Automation Letters (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.3390\/s22124324"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.660"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"},{"key":"e_1_3_2_1_45_1","volume-title":"Contrastive positive sample propagation along the audio-visual event line","author":"Zhou Jinxing","year":"2022","unstructured":"Jinxing Zhou, Dan Guo, and Meng Wang. 2022a. Contrastive positive sample propagation along the audio-visual event line. 
IEEE Transactions on Pattern Analysis and Machine Intelligence (2022)."},{"key":"e_1_3_2_1_46_1","volume-title":"Audio-Visual Segmentation","author":"Zhou Jinxing","unstructured":"Jinxing Zhou, Jianyuan Wang, Jiayi Zhang, Weixuan Sun, Jing Zhang, Stan Birchfield, Dan Guo, Lingpeng Kong, Meng Wang, and Yiran Zhong. 2022b. Audio-Visual Segmentation. In ECCV. Springer, 386--403."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00517"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00514"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612373","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612373","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:54:09Z","timestamp":1755820449000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612373"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":48,"alternative-id":["10.1145\/3581783.3612373","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612373","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}