{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:33Z","timestamp":1781539053698,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810748","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1547-1552","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Efficient Music Denoising with Channel Attention and Multi-Scale Sequence Encoding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-4683-4070","authenticated-orcid":false,"given":"Seungmin","family":"Ha","sequence":"first","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4486-8341","authenticated-orcid":false,"given":"Wei","family":"Li","sequence":"additional","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9080-9190","authenticated-orcid":false,"given":"Yulun","family":"Wu","sequence":"additional","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888409"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3546788"},{"key":"e_1_3_3_2_4_2","unstructured":"Rewon Child. 2019. Generating long sequences with sparse transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1904.10509 (2019)."},{"key":"e_1_3_3_2_5_2","volume-title":"International Conference on Learning Representations","author":"Choi Hyeong-Seok","year":"2018","unstructured":"Hyeong-Seok Choi, Jang-Hyun Kim, Jaesung Huh, Adrian Kim, Jung-Woo Ha, and Kyogu Lee. 2018. Phase-aware speech enhancement with deep complex u-net. In International Conference on Learning Representations."},{"key":"e_1_3_3_2_6_2","first-page":"933","volume-title":"International conference on machine learning","author":"Dauphin Yann\u00a0N","year":"2017","unstructured":"Yann\u00a0N Dauphin, Angela Fan, Michael Auli, and David Grangier. 2017. Language modeling with gated convolutional networks. In International conference on machine learning. PMLR, 933\u2013941."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461861"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_3_2_9_2","unstructured":"Yanxin Hu Yun Liu Shubo Lv Mengtao Xing Shimin Zhang Yihui Fu Jian Wu Bihong Zhang and Lei Xie. 2020. DCCRN: Deep complex convolution recurrent network for phase-aware speech enhancement. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2008.00264 (2020)."},{"key":"e_1_3_3_2_10_2","unstructured":"Andreas Jansson Eric Humphrey Nicola Montecchio Rachel Bittner Aparna Kumar and Tillman Weyde. 2017. Singing voice separation with deep u-net convolutional networks. (2017)."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Don\u00a0H Johnson. 2006. Signal-to-noise ratio. Scholarpedia 1 12 (2006) 2088.","DOI":"10.4249\/scholarpedia.2088"},{"key":"e_1_3_3_2_12_2","unstructured":"Yunpeng Li Beat Gfeller Marco Tagliasacchi and Dominik Roblek. 2020. Learning to denoise historical music. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2008.02027 (2020)."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Yi Luo and Nima Mesgarani. 2019. Conv-tasnet: Surpassing ideal time\u2013frequency magnitude masking for speech separation. IEEE\/ACM transactions on audio speech and language processing 27 8 (2019) 1256\u20131266.","DOI":"10.1109\/TASLP.2019.2915167"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Vinaya Manchaiah Steve Taddei Abram Bailey De\u00a0Wet Swanepoel Hansapani Rodrigo and Andrew Sabin. 2024. A novel consumer-centric metric for evaluating hearing device audio performance. Frontiers in Audiology and Otology 2 (2024) 1406362.","DOI":"10.3389\/fauot.2024.1406362"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Mohamed Medani Nasir Saleem Fethi Fkih Manal\u00a0Abdullah Alohali Hela Elmannai and Sami Bourouis. 2025. End-to-end feature fusion for jointly optimized speech enhancement and automatic speech recognition. Scientific Reports 15 1 (2025) 22892.","DOI":"10.1038\/s41598-025-05057-2"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746977"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Santiago Pascual Antonio Bonafonte and Joan Serra. 2017. SEGAN: Speech enhancement generative adversarial network. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1703.09452 (2017).","DOI":"10.21437\/Interspeech.2017-1428"},{"key":"e_1_3_3_2_18_2","unstructured":"Zafar Rafii Antoine Liutkus Fabian-Robert St\u00f6ter Stylianos\u00a0Ioannis Mimilakis and Rachel Bittner. 2017. The MUSDB18 corpus for music separation. (2017)."},{"key":"e_1_3_3_2_19_2","unstructured":"Daniel Stoller Sebastian Ewert and Simon Dixon. 2018. Wave-u-net: A multi-scale neural network for end-to-end audio source separation. Proceedings of the 19th International Society for Music Information Retrieval Conference (2018)."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2019.8937222"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Maximilian Strake Bruno Defraene Kristoff Fluyt Wouter Tirry and Tim Fingscheidt. 2020. Speech enhancement by LSTM-based noise suppression followed by CNN-based speech restoration. EURASIP Journal on Advances in Signal Processing 2020 1 (2020) 49.","DOI":"10.1186\/s13634-020-00707-1"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413901"},{"key":"e_1_3_3_2_23_2","first-page":"74","volume-title":"Machine Learning in Multimedia","author":"Swarnkar Suman\u00a0Kumar","year":"2025","unstructured":"Suman\u00a0Kumar Swarnkar and Yogesh\u00a0Kumar Rathore. 2025. Music Genre Classification Using Long Short-Term Memory (LSTM) Networks: Analyzing Audio Spectrograms for Enhanced Multimedia Understanding. In Machine Learning in Multimedia. CRC Press, 74\u201384."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447090"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP49062.2020.9231900"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952158"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"crossref","unstructured":"Vincent Emmanuel and Gribonval. 2006. Performance measurement in blind audio source separation. IEEE transactions on audio speech and language processing 14 4 (2006) 1462\u20131469.","DOI":"10.1109\/TSA.2005.858005"},{"key":"e_1_3_3_2_28_2","first-page":"11534","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"W Qilong","year":"2020","unstructured":"Qilong W, Banggu W, Pengfei Z, Peihua L, Wangmeng Z, and Qinghua H. 2020. ECA-Net: Efficient channel attention for deep convolutional neural networks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 11534\u201311542."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Donald\u00a0S Williamson Yuxuan Wang and DeLiang Wang. 2015. Complex ratio masking for monaural speech separation. IEEE\/ACM transactions on audio speech and language processing 24 3 (2015) 483\u2013492.","DOI":"10.1109\/TASLP.2015.2512042"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Yong Xu Jun Du Li-Rong Dai and Chin-Hui Lee. 2014. A regression approach to speech enhancement based on deep neural networks. IEEE\/ACM transactions on audio speech and language processing 23 1 (2014) 7\u201319.","DOI":"10.1109\/TASLP.2014.2364452"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Luo Yi and Jianwei Yu. 2023. Music source separation with band-split RNN. IEEE\/ACM Transactions on Audio Speech and Language Processing 31 (2023) 1893\u20131901.","DOI":"10.1109\/TASLP.2023.3271145"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6489"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:40:29Z","timestamp":1781538029000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810748"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":31,"alternative-id":["10.1145\/3805622.3810748","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810748","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}