{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T05:23:34Z","timestamp":1755926614215,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"This work was supported by National Key Research and Development Program of China","award":["No. 2018AAA0102200"],"award-info":[{"award-number":["No. 2018AAA0102200"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475555","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T04:59:18Z","timestamp":1634533158000},"page":"4202-4210","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Vision-guided Music Source Separation via a Fine-grained Cycle-Separation Network"],"prefix":"10.1145","author":[{"given":"Ma","family":"Shuo","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanli","family":"Ji","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xing","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaofeng","family":"Zhu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"volume-title":"Joon Son Chung, and Andrew Zisserman","year":"2018","author":"Afouras Triantafyllos","key":"e_1_3_2_2_1_1"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Relja Arandjelovic and Andrew Zisserman. 2017. Look Listen and Learn. In ICCV.  Relja Arandjelovic and Andrew Zisserman. 2017. Look Listen and Learn. In ICCV.","DOI":"10.1109\/ICCV.2017.73"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Relja Arandjelovic and Andrew Zisserman. 2018. Objects that sound. In ECCV.  Relja Arandjelovic and Andrew Zisserman. 2018. Objects that sound. In ECCV.","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/3327757.3327874"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-53547-0_25"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Honglie Chen Weidi Xie Andrea Vedaldi and Andrew Zisserman. 2020. VGGSound: A Large-scale Audio-Visual Dataset. In ICASSP.  Honglie Chen Weidi Xie Andrea Vedaldi and Andrew Zisserman. 2020. VGGSound: A Large-scale Audio-Visual Dataset. In ICASSP.","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/1822971"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Aviv Gabbay Asaph Shamir and Shmuel Peleg. 2018. Visual speech enhancement. In Interspeech.  Aviv Gabbay Asaph Shamir and Shmuel Peleg. 2018. Visual speech enhancement. In Interspeech.","DOI":"10.21437\/Interspeech.2018-1955"},{"key":"e_1_3_2_2_10_1","unstructured":"Chuang Gan Deng Huang Hang Zhao Joshua B. Tenenbaum and Antonio Torralba. 2020. Music Gesture for Visual Sound Separation. In CVPR.  Chuang Gan Deng Huang Hang Zhao Joshua B. Tenenbaum and Antonio Torralba. 2020. Music Gesture for Visual Sound Separation. In CVPR."},{"volume-title":"Rogrio Schmidt Feris, and Kristen Grauman","year":"2018","author":"Gao Ruohan","key":"e_1_3_2_2_11_1"},{"key":"e_1_3_2_2_12_1","unstructured":"Ruohan Gao and Kristen Grauman. 2019. Co-separating sounds of visual objects. In ICCV.  Ruohan Gao and Kristen Grauman. 2019. Co-separating sounds of visual objects. In ICCV."},{"volume-title":"Audio Set: An ontology and human-labeled dataset for audio events. In ICASSP.","year":"2017","author":"Gemmeke Jort F.","key":"e_1_3_2_2_13_1"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2716443"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1162\/0899766054322964"},{"key":"e_1_3_2_2_17_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR.  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"crossref","unstructured":"Sindhu B Hegde K R Prajwal Rudrabha Mukhopadhyay Vinay Namboodiri and C.V. Jawahar. 2021. Visual Speech Enhancement Without A Real Visual Stream.. In WACV.  Sindhu B Hegde K R Prajwal Rudrabha Mukhopadhyay Vinay Namboodiri and C.V. Jawahar. 2021. Visual Speech Enhancement Without A Real Visual Stream.. In WACV.","DOI":"10.1109\/WACV48630.2021.00197"},{"volume-title":"Jonathan Le Roux, and Shinji Watanabe","year":"2016","author":"Hershey John R.","key":"e_1_3_2_2_19_1"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Di Hu Feiping Nie and Xuelong Li. 2019. Deep Multimodal Clustering for Unsupervised Audiovisual Learning. In CVPR.  Di Hu Feiping Nie and Xuelong Li. 2019. Deep Multimodal Clustering for Unsupervised Audiovisual Learning. In CVPR.","DOI":"10.1109\/CVPR.2019.00947"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2992393"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cub.2009.09.005"},{"volume-title":"Torr","year":"2011","author":"Mittal Arpit","key":"e_1_3_2_2_24_1"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"Giovanni Morrone Sonia Bergamaschi Luca Pasa Luciano Fadiga Vadim Tikhanoff and Leonardo Badino. 2019. Face Landmark-based Speaker-independent Audio-visual Speech Enhancement in Multi-talker Environments. In ICASSP.  Giovanni Morrone Sonia Bergamaschi Luca Pasa Luciano Fadiga Vadim Tikhanoff and Leonardo Badino. 2019. Face Landmark-based Speaker-independent Audio-visual Speech Enhancement in Multi-talker Environments. In ICASSP.","DOI":"10.1109\/ICASSP.2019.8682061"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"crossref","unstructured":"Arsha Nagrani Samuel Albanie and Andrew Zisserman. 2018. Seeing Voices and Hearing Faces: Cross-Modal Biometric Matching. In CVPR.  Arsha Nagrani Samuel Albanie and Andrew Zisserman. 2018. Seeing Voices and Hearing Faces: Cross-Modal Biometric Matching. In CVPR.","DOI":"10.1109\/CVPR.2018.00879"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2580946"},{"volume-title":"Efros","year":"2018","author":"Owens Andrew","key":"e_1_3_2_2_28_1"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Rui Qian Di Hu Heinrich Dinkel Mengyue Wu Ning Xu and Weiyao Lin. 2020. Multiple Sound Sources Localization from Coarse to Fine. In ECCV.  Rui Qian Di Hu Heinrich Dinkel Mengyue Wu Ning Xu and Weiyao Lin. 2020. Multiple Sound Sources Localization from Coarse to Fine. In ECCV.","DOI":"10.1007\/978-3-030-58565-5_18"},{"volume-title":"Ellis","year":"2014","author":"Raffel Colin","key":"e_1_3_2_2_30_1"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"volume-title":"Plumbley","year":"2015","author":"Simpson Andrew J.","key":"e_1_3_2_2_32_1"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1819"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Naoya Takahashi Sudarsanam Parthasaarathy Nabarun Goswami and Yuki Mitsufuji. 2019. Recursive Speech Separation for Unknown Number of Speakers.. In Interspeech.  Naoya Takahashi Sudarsanam Parthasaarathy Nabarun Goswami and Yuki Mitsufuji. 2019. Recursive Speech Separation for Unknown Number of Speakers.. In Interspeech.","DOI":"10.21437\/Interspeech.2019-1550"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"Yapeng Tian Dingzeyu Li and Chenliang Xu. 2020. Unified Multisensory Perception: Weakly-Supervised Audio-Visual Video Parsing. In ECCV.  Yapeng Tian Dingzeyu Li and Chenliang Xu. 2020. Unified Multisensory Perception: Weakly-Supervised Audio-Visual Video Parsing. In ECCV.","DOI":"10.1007\/978-3-030-58580-8_26"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.885253"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"crossref","unstructured":"Jianren Wang Zhaoyuan Fang and Hang Zhao. 2020. AlignNet: A Unifying Approach to Audio-Visual Alignment. In WACV.  Jianren Wang Zhaoyuan Fang and Hang Zhao. 2020. AlignNet: A Unifying Approach to Audio-Visual Alignment. In WACV.","DOI":"10.1109\/WACV45572.2020.9093345"},{"volume-title":"Weiyang Liu, Bhiksha Raj, and Rita Singh.","year":"2018","author":"Wen Yandong","key":"e_1_3_2_2_38_1"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Xudong Xu Bo Dai and Dahua Lin. 2019. Recursive visual sound separation using minus-plus net. In ICCV.  Xudong Xu Bo Dai and Dahua Lin. 2019. Recursive visual sound separation using minus-plus net. In ICCV.","DOI":"10.1109\/ICCV.2019.00097"},{"key":"e_1_3_2_2_40_1","unstructured":"Dong Yu Morten Kolbaek Zheng-Hua Tan and Jesper Jensen. 2017. Permutation invariant training of deep models for speaker-independent multi-talker speech separation. In ICASSP.  Dong Yu Morten Kolbaek Zheng-Hua Tan and Jesper Jensen. 2017. Permutation invariant training of deep models for speaker-independent multi-talker speech separation. In ICASSP."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"Hang Zhao Chuang Gan Wei-Chiu Ma and Antonio Torralba. 2019. The sound of motions. In ICCV.  Hang Zhao Chuang Gan Wei-Chiu Ma and Antonio Torralba. 2019. The sound of motions. In ICCV.","DOI":"10.1109\/ICCV.2019.00182"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"crossref","unstructured":"Hang Zhao Chuang Gan Andrew Rouditchenko Carl Vondrick Josh H. McDermott and Antonio Torralba. 2018. The sound of pixels. In ECCV.  Hang Zhao Chuang Gan Andrew Rouditchenko Carl Vondrick Josh H. McDermott and Antonio Torralba. 2018. The sound of pixels. In ECCV.","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Hang Zhou Xudong Xu Dahua Lin Xiaogang Wang and Ziwei Liu. 2020. Sep-Stereo: Visually Guided Stereophonic Audio Generation by Associating Source Separation.. In ECCV.  Hang Zhou Xudong Xu Dahua Lin Xiaogang Wang and Ziwei Liu. 2020. Sep-Stereo: Visually Guided Stereophonic Audio Generation by Associating Source Separation.. In ECCV.","DOI":"10.1007\/978-3-030-58610-2_4"}],"event":{"name":"MM '21: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Virtual Event China","acronym":"MM '21"},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475555","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475555","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:49:10Z","timestamp":1750193350000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475555"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":43,"alternative-id":["10.1145\/3474085.3475555","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475555","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}