{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T19:08:50Z","timestamp":1762110530845,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62371009 &61971008"],"award-info":[{"award-number":["62371009 &61971008"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681070","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"9426-9434","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Instance-Level Panoramic Audio-Visual Saliency Detection and Ranking"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1091-272X","authenticated-orcid":false,"given":"Ruohao","family":"Guo","sequence":"first","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7421-5858","authenticated-orcid":false,"given":"Dantong","family":"Niu","sequence":"additional","affiliation":[{"name":"Berkeley AI Research, University of California, Berkeley, Berkeley, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5228-0348","authenticated-orcid":false,"given":"Liao","family":"Qu","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9931-7855","authenticated-orcid":false,"given":"Yanyu","family":"Qi","sequence":"additional","affiliation":[{"name":"College of Information and Electrical Engineering, China Agricultural University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4584-4569","authenticated-orcid":false,"given":"Ji","family":"Shi","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5165-0869","authenticated-orcid":false,"given":"Wenzhen","family":"Yue","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3254-3902","authenticated-orcid":false,"given":"Bowei","family":"Xing","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9957-9144","authenticated-orcid":false,"given":"Taiyan","family":"Chen","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9785-0727","authenticated-orcid":false,"given":"Xianghua","family":"Ying","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Advances in Neural Information Processing Systems","volume":"29","author":"Aytar Yusuf","year":"2016","unstructured":"Yusuf Aytar, Carl Vondrick, and Antonio Torralba. 2016. Soundnet: Learning sound representations from unlabeled video. In Advances in Neural Information Processing Systems, Vol. 29."},{"key":"e_1_3_2_1_2_1","volume-title":"A comprehensive survey on video saliency detection with auditory information: the audio-visual consistency perceptual is the key! IEEE Transactions on Circuits and Systems for Video Technology","author":"Chen Chenglizhao","year":"2022","unstructured":"Chenglizhao Chen, Mengke Song, Wenfeng Song, Li Guo, and Muwei Jian. 2022. A comprehensive survey on video saliency detection with auditory information: the audio-visual consistency perceptual is the key! IEEE Transactions on Circuits and Systems for Video Technology (2022)."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the International Conference on Machine Learning. 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In Proceedings of the International Conference on Machine Learning. 1597--1607."},{"key":"e_1_3_2_1_4_1","volume-title":"Mask2former for video instance segmentation. arXiv preprint arXiv:2112.10764","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng, Anwesa Choudhuri, Ishan Misra, Alexander Kirillov, Rohit Girdhar, and Alexander G Schwing. 2021. Mask2former for video instance segmentation. arXiv preprint arXiv:2112.10764 (2021)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413869"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2021. An image is worth 16x16 words: Transformers for image recognition at scale. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58539-6_42"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6718"},{"key":"e_1_3_2_1_9_1","volume-title":"UniTR: A Unified TRansformer-based Framework for Co-object and Multi-modal Saliency Detection","author":"Guo Ruohao","year":"2024","unstructured":"Ruohao Guo, Xianghua Ying, Yanyu Qi, and Liao Qu. 2024. UniTR: A Unified TRansformer-based Framework for Co-object and Multi-modal Saliency Detection. IEEE Transactions on Multimedia (2024)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2020.3028192"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00488"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.266"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.38"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.34"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611724"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00374"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the British Machine Vision Conference.","author":"Mahadevan Sabarinath","year":"2020","unstructured":"Sabarinath Mahadevan, Ali Athar, Sebastian Hennen, Laura Leal-Taix\u00e9, and Bastian Leibe. 2020. Making a case for 3d convolutions for object segmentation in videos. In Proceedings of the British Machine Vision Conference."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1126\/science.6867718"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1152\/jn.1986.56.3.640"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.242"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00943"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.223"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Liao Qu Xianwei Zou Xiang Li Yandong Wen Rita Singh and Bhiksha Raj. 2023. The Hidden Dance of Phonemes and Visage: Unveiling the Enigmatic Link between Phonemes and Facial Features. In Interspeech. 2578--2582.","DOI":"10.21437\/Interspeech.2023-340"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01520"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01215"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_44"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611812"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_43"},{"key":"e_1_3_2_1_30_1","volume-title":"Dave: A deep audio-visual embedding for dynamic saliency prediction. arXiv preprint arXiv:1905.10693","author":"Tavakoli Hamed R","year":"2019","unstructured":"Hamed R Tavakoli, Ali Borji, Esa Rahtu, and Juho Kannala. 2019. Dave: A deep audio-visual embedding for dynamic saliency prediction. arXiv preprint arXiv:1905.10693 (2019)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00482"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00054"},{"key":"e_1_3_2_1_33_1","first-page":"5448","article-title":"Bifuse: Self-supervised and efficient bi-projection fusion for 360 depth estimation","volume":"45","author":"Wang Fu-En","year":"2022","unstructured":"Fu-En Wang, Yu-Hsuan Yeh, Yi-Hsuan Tsai, Wei-Chen Chiu, and Min Sun. 2022. Bifuse: Self-supervised and efficient bi-projection fusion for 360 depth estimation. IEEE Transactions on Pattern Analysis and Machine Intelligence 45, 5 (2022), 5448--5460.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6916"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01304"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00403"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00736"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the IEEE International Conference on Computer Vision. 7284--7293","author":"Yan Pengxiang","year":"2019","unstructured":"Pengxiang Yan, Guanbin Li, Yuan Xie, Zhen Li, Chuan Wang, Tianshui Chen, and Liang Lin. 2019. Semi-supervised video salient object detection using pseudolabels. In Proceedings of the IEEE International Conference on Computer Vision. 7284--7293."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01641"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3565267","article-title":"PAVSOD: A New Task towards Panoramic Audiovisual Saliency Detection","volume":"19","author":"Zhang Yi","year":"2023","unstructured":"Yi Zhang, Fang-Yi Chao, Wassim Hamidouche, and Olivier Deforges. 2023. PAVSOD: A New Task towards Panoramic Audiovisual Saliency Detection. ACM Transactions on Multimedia Computing, Communications and Applications 19, 3 (2023), 1--26.","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_3"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_22"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7008"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. Deformable detr: Deformable transformers for end-to-end object detection. In Proceedings of the International Conference on Learning Representations."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681070","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681070","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:52Z","timestamp":1750294672000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681070"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":45,"alternative-id":["10.1145\/3664647.3681070","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681070","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}