{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T10:05:08Z","timestamp":1767261908106,"version":"3.37.3"},"reference-count":56,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Science and Technology Innovation Committee of Shenzhen Municipality Foundation","award":["JCYJ20210324132203007"],"award-info":[{"award-number":["JCYJ20210324132203007"]}]},{"name":"National Key Research and Development Program of China","award":["2018AAA0102200"],"award-info":[{"award-number":["2018AAA0102200"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2023]]},"DOI":"10.1109\/tmm.2022.3200282","type":"journal-article","created":{"date-parts":[[2022,8,19]],"date-time":"2022-08-19T19:31:45Z","timestamp":1660937505000},"page":"5864-5876","source":"Crossref","is-referenced-by-count":6,"title":["Self-Supervised Fine-Grained Cycle-Separation Network (FSCN) for Visual-Audio Separation"],"prefix":"10.1109","volume":"25","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9122-6141","authenticated-orcid":false,"given":"Yanli","family":"Ji","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuo","family":"Ma","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5685-3123","authenticated-orcid":false,"given":"Xing","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2924-946X","authenticated-orcid":false,"given":"Xuelong","family":"Li","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, Xi&#x0027;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2999-2088","authenticated-orcid":false,"given":"Heng Tao","family":"Shen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00398"},{"key":"ref56","first-page":"367","article-title":"Mir-eval: A transparent implementation of common MIR metrics","author":"raffel","year":"0","journal-title":"Proc ISMIR"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00182"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00097"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref52","first-page":"234","article-title":"U-Net: Convolutional networks for biomedical image segmentation","author":"ronneberger","year":"0","journal-title":"Proc Int Conf Med Image Comput Comput - Assist Interv"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.858005"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_4"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01049"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00816"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2018.2883607"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.5244\/C.25.75"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8545493"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2019.2928180"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2019.2901853"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00197"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1955"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_18"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682061"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-53547-0_25"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952154"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1162\/0899766054322964"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"20","DOI":"10.1109\/TCYB.2015.2391252","article-title":"Fast sound source localization using two-level search space clustering","volume":"46","author":"dongsuk","year":"2016","journal-title":"IEEE Trans Cybern"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-22482-4_50"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.cub.2009.09.005"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2580946"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.885253"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2716443"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1550"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2992393"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00277"},{"key":"ref32","first-page":"10077","article-title":"Discriminative sounding objects localization via self-supervised audiovisual matching","author":"hu","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TSMCB.2012.2226443"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3050089"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3061800"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2937185"},{"key":"ref24","first-page":"7774","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","author":"bruno","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093345"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_26"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00947"},{"key":"ref22","first-page":"631","article-title":"Audio-visual scene analysis with self-supervised multisensory features","author":"owens","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1819"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00879"},{"key":"ref29","first-page":"1","article-title":"Disjoint mapping network for cross-modal matching of voices and faces","author":"wen","year":"0","journal-title":"Proc Int Conf Learn Representations"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/10016790\/09863655.pdf?arnumber=9863655","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,11]],"date-time":"2023-12-11T20:22:39Z","timestamp":1702326159000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9863655\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":56,"URL":"https:\/\/doi.org\/10.1109\/tmm.2022.3200282","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"type":"print","value":"1520-9210"},{"type":"electronic","value":"1941-0077"}],"subject":[],"published":{"date-parts":[[2023]]}}}