{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T15:52:28Z","timestamp":1781193148516,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681550","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"1652-1661","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":16,"title":["MSFNet: Multi-Scale Fusion Network for Brain-Controlled Speaker Extraction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6318-8803","authenticated-orcid":false,"given":"Cunhang","family":"Fan","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Anhui University, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1173-2970","authenticated-orcid":false,"given":"Jingjing","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Anhui University, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0032-7566","authenticated-orcid":false,"given":"Hongyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Anhui University, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8894-1131","authenticated-orcid":false,"given":"Wang","family":"Xiang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Anhui University, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0477-587X","authenticated-orcid":false,"given":"Jianhua","family":"Tao","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5522-0037","authenticated-orcid":false,"given":"Xinhui","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Anhui University, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2422-4618","authenticated-orcid":false,"given":"Jiangyan","family":"Yi","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5200-2265","authenticated-orcid":false,"given":"Dianbo","family":"Sui","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Weihai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4530-4422","authenticated-orcid":false,"given":"Zhao","family":"Lv","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Anhui University, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018--1400"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cub.2018.01.080"},{"key":"e_1_3_2_1_3_1","volume-title":"The cocktail party phenomenon: A review of research on speech intelligibility in multiple-talker conditions. Acta Acustica united with Acustica 86, 1","author":"Bronkhorst Adelbert W","year":"2000","unstructured":"Adelbert W Bronkhorst. 2000. The cocktail party phenomenon: A review of research on speech intelligibility in multiple-talker conditions. Acta Acustica united with Acustica 86, 1 (2000), 117--128."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447633"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2020.117282"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611869"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jneumeth.2003.10.009"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746221"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2552\/ac16b4"},{"key":"e_1_3_2_1_11_1","volume-title":"Speaker-independent auditory attention decoding without access to clean speech sources. Science advances 5, 5","author":"Han Cong","year":"2019","unstructured":"Cong Han, James O'Sullivan, Yi Luo, Jose Herrero, Ashesh D Mehta, and Nima Mesgarani. 2019. Speaker-independent auditory attention decoding without access to clean speech sources. Science advances 5, 5 (2019), eaav6134."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414969"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3169629"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2022.104408"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053340"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683855"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054266"},{"key":"e_1_3_2_1_18_1","volume-title":"Conv-tasnet: Surpassing ideal time--frequency magnitude masking for speech separation","author":"Luo Yi","year":"2019","unstructured":"Yi Luo and Nima Mesgarani. 2019. Conv-tasnet: Surpassing ideal time--frequency magnitude masking for speech separation. IEEE\/ACM transactions on audio, speech, and language processing 27, 8 (2019), 1256--1266."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.heares.2020.107994"},{"key":"e_1_3_2_1_20_1","volume-title":"Nima Mesgarani, Siddharth Rajaram, John J Foxe, Barbara G Shinn-Cunningham, Malcolm Slaney, Shihab A Shamma, and Edmund C Lalor.","author":"O'sullivan James A","year":"2015","unstructured":"James A O'sullivan, Alan J Power, Nima Mesgarani, Siddharth Rajaram, John J Foxe, Barbara G Shinn-Cunningham, Malcolm Slaney, Shihab A Shamma, and Edmund C Lalor. 2015. Attentional selection in a cocktail party environment can be decoded from single-trial EEG. Cerebral cortex 25, 7 (2015), 1697--1706."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2552\/aa7ab4"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3205759"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3175130"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682514"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846260"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413901"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5495701"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","first-page":"e0205355","DOI":"10.1371\/journal.pone.0205355","article-title":"Multi-resolution speech analysis for automatic speech recognition using deep neural networks","volume":"13","author":"Toledano Doroteo T","year":"2018","unstructured":"Doroteo T Toledano, Mar\u00eda Pilar Fern\u00e1ndez-Gallego, and Alicia Lozano-Diez. 2018. Multi-resolution speech analysis for automatic speech recognition using deep neural networks: Experiments on TIMIT. PloS one 13, 10 (2018), e0205355.","journal-title":"Experiments on TIMIT. PloS one"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1101"},{"key":"e_1_3_2_1_33_1","volume-title":"TF-GridNet: Integrating full-and sub-band modeling for speech separation","author":"Wang Zhong-Qiu","year":"2023","unstructured":"Zhong-Qiu Wang, Samuele Cornell, Shukjae Choi, Younglo Lee, Byeong-Yeol Kim, and Shinji Watanabe. 2023. TF-GridNet: Integrating full-and sub-band modeling for speech separation. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Time domain audio visual speech separation. In 2019 IEEE automatic speech recognition and understanding workshop (ASRU)","author":"Xu Yong","unstructured":"JianWu, Yong Xu, Shi-Xiong Zhang, Lian-Wu Chen, Meng Yu, Lei Xie, and Dong Yu. 2019. Time domain audio visual speech separation. In 2019 IEEE automatic speech recognition and understanding workshop (ASRU). IEEE, 667--673."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9004016"},{"key":"e_1_3_2_1_36_1","volume-title":"Eng Siong Chng, and Haizhou Li","author":"Xu Chenglin","year":"2020","unstructured":"Chenglin Xu, Wei Rao, Eng Siong Chng, and Haizhou Li. 2020. Spex: Multi-scale time domain speaker extraction network. IEEE\/ACM transactions on audio, speech, and language processing 28 (2020), 1370--1384."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612424"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096664"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447529"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-673"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445985"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.5055562"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681550","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681550","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:48Z","timestamp":1750295868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681550"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":42,"alternative-id":["10.1145\/3664647.3681550","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681550","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}