{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,4]],"date-time":"2026-07-04T07:34:57Z","timestamp":1783150497540,"version":"3.54.6"},"reference-count":161,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62172246"],"award-info":[{"award-number":["62172246"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61976123"],"award-info":[{"award-number":["61976123"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100011160","name":"Open Project Program of the State Key Laboratory of Virtual Reality Technology and Systems","doi-asserted-by":"publisher","award":["VRLAB2021A05"],"award-info":[{"award-number":["VRLAB2021A05"]}],"id":[{"id":"10.13039\/501100011160","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Youth Innovation and Technology Support Plan of Colleges and Universities in Shandong Province","award":["2021KJ062"],"award-info":[{"award-number":["2021KJ062"]}]},{"name":"Taishan Young Scholars Program of Shandong Province"},{"name":"Key Development Program for Basic Research of Shandong Province","award":["ZR2020ZD44"],"award-info":[{"award-number":["ZR2020ZD44"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2023,2]]},"DOI":"10.1109\/tcsvt.2022.3203421","type":"journal-article","created":{"date-parts":[[2022,9,1]],"date-time":"2022-09-01T19:39:56Z","timestamp":1662061196000},"page":"457-477","source":"Crossref","is-referenced-by-count":41,"title":["A Comprehensive Survey on Video Saliency Detection With Auditory Information: The Audio-Visual Consistency Perceptual is the Key!"],"prefix":"10.1109","volume":"33","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9982-5667","authenticated-orcid":false,"given":"Chenglizhao","family":"Chen","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, Qingdao Institute of Software, China University of Petroleum (East China), Qingdao, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mengke","family":"Song","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Qingdao Institute of Software, China University of Petroleum (East China), Qingdao, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wenfeng","family":"Song","sequence":"additional","affiliation":[{"name":"Computer School, Beijing Information Science and Technology University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Li","family":"Guo","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Qingdao University, Qingdao, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4249-2264","authenticated-orcid":false,"given":"Muwei","family":"Jian","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Shandong University of Finance and Economics, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3052069"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2968250"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3068649"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2014.2356200"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2839523"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2017.2775212"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2670143"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2015.09.033"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3095843"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2934350"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2940851"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2021.08.069"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2015.2459017"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2870832"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2924417"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-021-1293-0"},{"key":"ref18","first-page":"1","article-title":"Objects that sound","volume-title":"Proc. ECCV","author":"Arandjelovi"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2952095"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref21","article-title":"Contrastive learning of global-local video representations","author":"Ma","year":"2021","journal-title":"arXiv:2104.05418"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414296"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129331"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/taffc.2021.3111110"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054768"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-95388-1_33"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.2989158"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20017"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20009"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00866"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2936112"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20011"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20200"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00061"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897762"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01655"},{"key":"ref37","article-title":"Video salient object detection via adaptive local-global refinement","author":"Tang","year":"2021","journal-title":"arXiv:2104.14360"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP42928.2021.9506492"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475192"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00737"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_13"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00158"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00488"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_44"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00875"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6718"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3068644"},{"key":"ref48","article-title":"Beyond pixels: Exploring new representation and applications for motion analysis","author":"Liu","year":"2009"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref50","first-page":"1","article-title":"Convolutional LSTM network: A machine learning approach for precipitation nowcasting","volume-title":"Proc. NIPS","author":"Shi"},{"key":"ref51","article-title":"Learning spatiotemporal features with 3D convolutional networks","author":"Tran","year":"2014","journal-title":"arXiv:1412.0767"},{"key":"ref52","article-title":"A unified transformer framework for group-based segmentation: Co-segmentation, co-saliency detection and video salient object detection","author":"Su","year":"2022","journal-title":"arXiv:2203.04708"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2022.06.006"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2777665"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.316"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.179"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3036749"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00514"},{"key":"ref59","article-title":"Simple vs complex temporal recurrences for video saliency prediction","author":"Linardos","year":"2019","journal-title":"arXiv:1907.01869"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107615"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_25"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_37"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6927"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2851672"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00783"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2021.04.010"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00248"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01519-y"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104216"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2022.3172971"},{"key":"ref71","article-title":"Very deep convolutional networks for large-scale image recognition","volume-title":"Proc. ICLR","author":"Simonyan"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413869"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00715"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01487"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/s10772-021-09944-7"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.3390\/app12010527"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3061800"},{"key":"ref79","first-page":"892","article-title":"SoundNet: Learning sound representations from unlabeled video","volume-title":"Proc. NIPS","author":"Aytar"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-74048-3"},{"key":"ref82","first-page":"9758","article-title":"Self-supervised learning by cross-modal audio-video clustering","volume-title":"Proc. NIPS","author":"Alwassel"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00639"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_26"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01047"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00374"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00398"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"ref90","article-title":"Unsupervised sound localization via iterative contrastive learning","author":"Lin","year":"2021","journal-title":"arXiv:2104.00315"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683133"},{"key":"ref92","article-title":"Themes informed audio-visual correspondence learning","author":"Su","year":"2020","journal-title":"arXiv:2009.06573"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3056223"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2021.3084613"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3128214"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2994524"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00879"},{"key":"ref98","first-page":"1","article-title":"Disjoint mapping network for cross-modal matching of voices and faces","volume-title":"Proc. ICLR","author":"Weny"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW.2019.00-70"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_5"},{"key":"ref101","article-title":"Putting a face to the voice: Fusing audio and visual signals across a video to determine speakers","author":"Hoover","year":"2017","journal-title":"arXiv:1706.00079"},{"key":"ref102","first-page":"1","article-title":"Cross-modal embeddings for video and audio retrieval","volume-title":"Proc. ECCVW","author":"Sur\u00eds"},{"key":"ref103","first-page":"813","article-title":"Audio-vision: Using audio-visual synchrony to locate sounds","volume-title":"Proc. NIPS","author":"Hershey"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00182"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2012.2228476"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.2307\/2333955"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.274"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1145\/1631272.1631344"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-53842-1_40"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ROBOT.2008.4543329"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2011.6095124"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636585"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2021.3137988"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107906"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3143882"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2020.3025947"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1016\/S0042-6989(99)00163-7"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1016\/j.cub.2005.09.040"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638898"},{"key":"ref123","first-page":"1","article-title":"Auditory saliency using natural statistics","volume-title":"Proc. Annu. Meeting Cogn. Sci. Soc. (PAMCSS)","author":"Tsuchida"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2882055"},{"key":"ref125","first-page":"1294","article-title":"A saliency-based approach to audio event detection and summarization","volume-title":"Proc. 20th Eur. Signal Process. Conf. (EUSIPCO)","author":"Zlatintsi"},{"key":"ref126","first-page":"3553","article-title":"Video event detection and summarization using audio","volume-title":"Proc. ICASSP","author":"Evangelopoulos"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/CISS.2012.6310945"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6288368"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2013.2267205"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2015.7362797"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683586"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/CISS.2013.6552285"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.2352\/ISSN.2470-1173.2016.13.IQSP-217"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/ICME51207.2021.9428415"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP42928.2021.9506089"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/VCIP49819.2020.9301766"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/EBCCSP.2015.7300674"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2014.7025219"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2016.08.130"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX.2014.6982312"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1145\/2996463"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2966082"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1016\/j.image.2019.05.001"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1163\/22134808-00002417"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/TAMD.2009.2021091"},{"key":"ref146","article-title":"DAVE: A deep audio-visual embedding for dynamic saliency prediction","author":"Tavakoli","year":"2020","journal-title":"arXiv:1905.10693"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00482"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9635989"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.12.011"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1007\/s12559-010-9074-z"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/WIAMIS.2013.6616164"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1167\/14.8.5"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_33"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1016\/j.image.2015.08.004"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/34.730558"},{"key":"ref156","first-page":"545","article-title":"Graph-based visual saliency","volume-title":"Proc. NIPS","volume":"8","author":"Jonathan"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.436"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2567391"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00097"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.404"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00623"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/76\/10036154\/09874810.pdf?arnumber=9874810","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,2]],"date-time":"2024-03-02T06:16:49Z","timestamp":1709360209000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9874810\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2]]},"references-count":161,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2022.3203421","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,2]]}}}