{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T16:25:29Z","timestamp":1782404729910,"version":"3.54.5"},"reference-count":68,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1109\/tpami.2025.3573994","type":"journal-article","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T13:15:53Z","timestamp":1748351753000},"page":"7643-7659","source":"Crossref","is-referenced-by-count":2,"title":["Toward Interactive Sound Source Localization: Better Align Sight and Sound!"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9141-3270","authenticated-orcid":false,"given":"Arda","family":"Senocak","sequence":"first","affiliation":[{"name":"School of Electrical Engineering, KAIST, Daejeon, South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5638-1832","authenticated-orcid":false,"given":"Hyeonggon","family":"Ryu","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering, KAIST, Daejeon, South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2555-5232","authenticated-orcid":false,"given":"Junsik","family":"Kim","sequence":"additional","affiliation":[{"name":"School of Engineering and Applied Sciences, Harvard University, Boston, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0468-1571","authenticated-orcid":false,"given":"Tae-Hyun","family":"Oh","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering, POSTECH, Pohang, South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3620-2582","authenticated-orcid":false,"given":"Hanspeter","family":"Pfister","sequence":"additional","affiliation":[{"name":"School of Engineering and Applied Sciences, Harvard University, Boston, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7741-7275","authenticated-orcid":false,"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering, KAIST, Daejeon, South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref3","first-page":"892","article-title":"SoundNet: Learning sound representations from unlabeled video","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Aytar"},{"key":"ref4","article-title":"Mine your own view: Self-supervised learning through across-sample prediction","author":"Azabou","year":"2021"},{"key":"ref5","article-title":"Sentence-level prompts benefit composed image retrieval","author":"Bai","year":"2023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01407"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref10","article-title":"Improved baselines with momentum contrastive learning","author":"Chen","year":"2020"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.202"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1113"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00945"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01841"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00231"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref21","first-page":"6704","article-title":"Cyclip: Cyclic contrastive language-image pretraining","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Goel"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01246"},{"key":"ref23","first-page":"5679","article-title":"Self-supervised co-training for video representation learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Han"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24261-3_7"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00947"},{"key":"ref27","first-page":"10077","article-title":"Discriminative sounding objects localization via self-supervised audiovisual matching","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hu"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"key":"ref29","article-title":"Vision-by-language for training-free compositional image retrieval","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Karthik"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref31","first-page":"7774","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Korbar"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.5244\/C.35.116"},{"key":"ref33","article-title":"Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Li"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2022.103602"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548317"},{"key":"ref36","first-page":"37524","article-title":"A closer look at weakly-supervised audio-visual source localization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Mo"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_13"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"ref40","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-69544-6_8"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10097234"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_18"},{"key":"ref46","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095091"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01850"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2952095"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747867"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00065"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00715"},{"key":"ref55","first-page":"3222","article-title":"Self-supervised predictive learning: A negative-free method for sound source localization in visual scenes","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Song"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00621"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00622"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3176690"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00110"},{"key":"ref63","first-page":"26497","article-title":"Unraveling instance associations: A closer look for audio-visual segmentation","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Yuanhong"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00612"},{"key":"ref65","article-title":"Diagnosing and rectifying vision models using language","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhang"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02261-x"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_22"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11118328\/11016078.pdf?arnumber=11016078","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T17:44:13Z","timestamp":1754588653000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11016078\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":68,"journal-issue":{"issue":"9"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3573994","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9]]}}}