{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T19:04:57Z","timestamp":1778267097200,"version":"3.51.4"},"reference-count":78,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276208"],"award-info":[{"award-number":["62276208"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["12201490"],"award-info":[{"award-number":["12201490"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["12326607"],"award-info":[{"award-number":["12326607"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["12371512"],"award-info":[{"award-number":["12371512"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["12301656"],"award-info":[{"award-number":["12301656"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Postdoctoral Fellowship Program of CPSF","award":["GZB20230581"],"award-info":[{"award-number":["GZB20230581"]}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["xzy012023047"],"award-info":[{"award-number":["xzy012023047"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2024A1515010919"],"award-info":[{"award-number":["2024A1515010919"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1109\/tpami.2024.3444029","type":"journal-article","created":{"date-parts":[[2024,8,15]],"date-time":"2024-08-15T17:35:49Z","timestamp":1723743349000},"page":"10499-10514","source":"Crossref","is-referenced-by-count":2,"title":["Enhancing Sound Source Localization via False Negative Elimination"],"prefix":"10.1109","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7914-3252","authenticated-orcid":false,"given":"Zengjie","family":"Song","sequence":"first","affiliation":[{"name":"School of Mathematics and Statistics, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8395-1180","authenticated-orcid":false,"given":"Jiangshe","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Mathematics and Statistics, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1579-2357","authenticated-orcid":false,"given":"Yuxi","family":"Wang","sequence":"additional","affiliation":[{"name":"Center for Artificial Intelligence and Robotics, Hong Kong Institute of Science &amp; Innovation, Chinese Academy of Sciences, Hong Kong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6989-2711","authenticated-orcid":false,"given":"Junsong","family":"Fan","sequence":"additional","affiliation":[{"name":"Center for Artificial Intelligence and Robotics, Hong Kong Institute of Science &amp; Innovation, Chinese Academy of Sciences, Hong Kong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2648-3875","authenticated-orcid":false,"given":"Zhaoxiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"New Laboratory of Pattern Recognition, State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"813","article-title":"Audio-Vision: Using audio-visual synchrony to locate sounds","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Hershey"},{"key":"ref2","first-page":"772","article-title":"Learning joint statistical models for audio-visual fusion and segregation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Fisher III"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00947"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_18"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58523-5_13"},{"key":"ref8","first-page":"10077","article-title":"Discriminative sounding objects localization via self-supervised audiovisual matching","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Hu"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00277"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2952095"},{"key":"ref12","first-page":"4733","article-title":"Learning representations from audio-visual spatial alignment","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Morgado"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2022.103602"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747867"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548317"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"ref17","first-page":"3222","article-title":"Self-supervised predictive learning: A negative-free method for sound source localization in visual scenes","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","author":"Song"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00621"},{"key":"ref20","first-page":"4182","article-title":"Data-efficient image recognition with contrastive predictive coding","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"H\u00e9naff"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref22","article-title":"Improved baselines with momentum contrastive learning","author":"Chen","year":"2020"},{"key":"ref23","first-page":"22243","article-title":"Big self-supervised models are strong semi-supervised learners","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Chen"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16189"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"ref29","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"ref31","first-page":"12310","article-title":"Barlow Twins: Self-supervised learning via redundancy reduction","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zbontar"},{"key":"ref32","first-page":"3015","article-title":"Whitening for self-supervised representation learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ermolov"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497510"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"ref35","first-page":"892","article-title":"SoundNet: Learning sound representations from unlabeled video","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Aytar"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"ref37","first-page":"7774","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Korbar"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3009820"},{"key":"ref39","first-page":"29258","article-title":"How does it sound? Generation of rhythmic soundtracks for human movement videos","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Su"},{"key":"ref40","article-title":"Curriculum audiovisual learning","author":"Hu","year":"2020"},{"key":"ref41","first-page":"9758","article-title":"Self-supervised learning by cross-modal audio-video clustering","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Alwassel"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00694"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.274"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2012.2228476"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00715"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01144"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3137988"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00110"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01023"},{"key":"ref52","first-page":"5628","article-title":"A theoretical analysis of contrastive unsupervised representation learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Arora"},{"key":"ref53","first-page":"18661","article-title":"Supervised contrastive learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Khosla"},{"key":"ref54","first-page":"8765","article-title":"Debiased contrastive learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Chuang"},{"key":"ref55","first-page":"1","article-title":"Prototypical contrastive learning of unsupervised representations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Li"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00106"},{"key":"ref57","first-page":"1","article-title":"Incremental false negative detection for contrastive learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen"},{"key":"ref58","first-page":"1","article-title":"Very deep convolutional networks for large-scale image recognition","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Simonyan"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1038\/4580"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1016\/j.bandc.2015.11.003"},{"key":"ref63","first-page":"5266","article-title":"Deep predictive coding network for object recognition","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wen"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2018.2862866"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2023.3288022"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00993"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000022288.19776.77"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.7287\/peerj.453v0.1\/reviews\/2"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref72","first-page":"1","article-title":"Decoupled weight decay regularization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Loshchilov"},{"key":"ref73","first-page":"4660","article-title":"Labelling unlabelled videos from scratch with multi-modal self-supervision","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Asano"},{"key":"ref74","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ren"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00996"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58607-2_16"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1002\/nav.3800020109"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_8"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/10746266\/10637713.pdf?arnumber=10637713","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:27:59Z","timestamp":1732667279000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10637713\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":78,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2024.3444029","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12]]}}}