{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:34:34Z","timestamp":1776890074243,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Science Foundation of China","award":["62272466"],"award-info":[{"award-number":["62272466"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680985","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"8623-8632","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["MAJL: A Model-Agnostic Joint Learning Framework for Music Source Separation and Pitch Estimation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2716-7866","authenticated-orcid":false,"given":"Haojie","family":"Wei","sequence":"first","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6003-9714","authenticated-orcid":false,"given":"Jun","family":"Yuan","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8132-6250","authenticated-orcid":false,"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Huazhong University of Science and Technology (www.ruizhang.info), Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7578-2738","authenticated-orcid":false,"given":"Quanyu","family":"Dai","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2239-4472","authenticated-orcid":false,"given":"Yueguo","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Automatic Music Transcription: An Overview","author":"Benetos Emmanouil","year":"2018","unstructured":"Emmanouil Benetos, Simon Dixon, Zhiyao Duan, and Sebastian Ewert. 2018. Automatic Music Transcription: An Overview. IEEE Signal Processing Magazine (2018), 20--30."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the International Society for Music Information Retrieval Conference (ISMIR). 155--160","author":"Bittner Rachel M","year":"2014","unstructured":"Rachel M Bittner, Justin Salamon, Mike Tierney, Matthias Mauch, Chris Cannam, and Juan Pablo Bello. 2014. Medleydb: A multitrack dataset for annotation-intensive mir research.. In Proceedings of the International Society for Music Information Retrieval Conference (ISMIR). 155--160."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"A. Camacho and J. G. Harris. 2008. A sawtooth waveform inspired pitch estimator for speech and music. In The Journal of the Acoustical Society of America. 1638--1652.","DOI":"10.1121\/1.2951592"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Xuxin Cheng Bowen Cao Qichen Ye Zhihong Zhu Hongxiang Li and Yuexian Zou. 2023. Ml-lmcl: Mutual learning and large-margin contrastive learning for improving asr robustness in spoken language understanding. In Findings of the Association for Computational Linguistics (ACL). 6492--6505.","DOI":"10.18653\/v1\/2023.findings-acl.406"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Xuxin Cheng Zhihong Zhu Bowen Cao Qichen Ye and Yuexian Zou. 2023. Mrrl: Modifying the reference via reinforcement learning for non-autoregressive joint multiple intent detection and slot filling. In Findings of the Association for Computational Linguistics (EMNLP). 10495--10505.","DOI":"10.18653\/v1\/2023.findings-emnlp.704"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29738"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.597"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Alain De Cheveign\u00e9 and Hideki Kawahara. 2002. YIN a fundamental frequency estimator for speech and music. In The Journal of the Acoustical Society of America. 1917--1930.","DOI":"10.1121\/1.1458024"},{"key":"e_1_3_2_1_9_1","volume-title":"Hybrid spectrogram and waveform source separation. arXiv preprint arXiv:2111.03600","author":"D\u00e9fossez Alexandre","year":"2021","unstructured":"Alexandre D\u00e9fossez. 2021. Hybrid spectrogram and waveform source separation. arXiv preprint arXiv:2111.03600 (2021)."},{"key":"e_1_3_2_1_10_1","volume-title":"Music source separation in the waveform domain. arXiv preprint arXiv:1911.13254","author":"D\u00e9fossez Alexandre","year":"2019","unstructured":"Alexandre D\u00e9fossez, Nicolas Usunier, L\u00e9on Bottou, and Francis Bach. 2019. Music source separation in the waveform domain. arXiv preprint arXiv:1911.13254 (2019)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Mingye Dong Jie Wu and Jian Luan. 2019. Vocal Pitch Extraction in Polyphonic Music Using Convolutional Residual Network. In INTERSPEECH. 2010--2014.","DOI":"10.21437\/Interspeech.2019-2286"},{"key":"e_1_3_2_1_12_1","volume-title":"Soundprism: An online system for score-informed source separation of music audio","author":"Duan Zhiyao","year":"2011","unstructured":"Zhiyao Duan and Bryan Pardo. 2011. Soundprism: An online system for score-informed source separation of music audio. IEEE Journal of Selected Topics in Signal Processing (2011), 1205--1215."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1976.1162765"},{"key":"e_1_3_2_1_14_1","volume-title":"Plumbley","author":"Ewert Sebastian","year":"2014","unstructured":"Sebastian Ewert, Bryan Pardo, Meinard Muller, and Mark D. Plumbley. 2014. Score-Informed Source Separation for Musical Audio Recordings: An overview. IEEE Signal Processing Magazine (2014), 116--124."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigMM.2016.56"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Yuan Gao Ying Hu Liusong Wang Hao Huang and Liang He. 2023. MTANet: Multi-band Time-frequency Attention Network for Singing Melody Extraction from Polyphonic Music. In INTERSPEECH. 5396--5400.","DOI":"10.21437\/Interspeech.2023-2494"},{"key":"e_1_3_2_1_17_1","volume-title":"Vocal melody extraction via hrnet-based singing voice separation and encoder-decoder-based f0 estimation. Electronics","author":"Gao Yongwei","year":"2021","unstructured":"Yongwei Gao, Xulong Zhang, and Wei Li. 2021. Vocal melody extraction via hrnet-based singing voice separation and encoder-decoder-based f0 estimation. Electronics (2021), 298."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.2982285"},{"key":"e_1_3_2_1_19_1","volume-title":"Spleeter: a fast and efficient music source separation tool with pre-trained models. Journal of Open Source Software","author":"Hennequin Romain","year":"2020","unstructured":"Romain Hennequin, Anis Khlif, Felix Voituret, and Manuel Moussallam. 2020. Spleeter: a fast and efficient music source separation tool with pre-trained models. Journal of Open Source Software (2020), 2154."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the IEEE Transactions on Audio, Speech, and Language Processing (TASLP). 310--319","author":"Hsu Chao-Ling","year":"2010","unstructured":"Chao-Ling Hsu and Jyh-Shing Roger Jang. 2010. On the Improvement of Singing Voice Separation for Monaural Recordings Using the MIR-1K Dataset. In Proceedings of the IEEE Transactions on Audio, Speech, and Language Processing (TASLP). 310--319."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2182510"},{"key":"e_1_3_2_1_22_1","unstructured":"iResearch. 2019. China's Digital Music Market Will Face A New Round of Competition. In iResearch. http:\/\/www.iresearchchina.com\/content\/details7_53675.html"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2019.8902550"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the International Society for Music Information Retrieval Conference (ISMIR).","author":"Jansson Andreas","year":"2017","unstructured":"Andreas Jansson, Eric J. Humphrey, Nicola Montecchio, Rachel M. Bittner, Aparna Kumar, and Tillman Weyde. 2017. Singing Voice Separation with Deep U-Net Convolutional Networks. In Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461329"},{"key":"e_1_3_2_1_26_1","volume-title":"Kuielab-mdx-net: A two-stream neural network for music demixing. arXiv preprint arXiv:2111.12203","author":"Kim Minseok","year":"2021","unstructured":"Minseok Kim, Woosung Choi, Jaehwa Chung, Daewon Lee, and Soonyoung Jung. 2021. Kuielab-mdx-net: A two-stream neural network for music demixing. arXiv preprint arXiv:2111.12203 (2021)."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Diederik","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the International Society for Music Information Retrieval Conference (ISMIR).","author":"Kong Qiuqiang","year":"2021","unstructured":"Qiuqiang Kong, Yin Cao, Haohe Liu, Keunwoo Choi, and Yuxuan Wang. 2021. Decoupling magnitude and phase estimation with deep resunet for music source separation. In Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)."},{"key":"e_1_3_2_1_29_1","volume-title":"Joint Detection and Classification of Singing Voice Melody Using Convolutional Recurrent Neural Networks. Applied Sciences","author":"Kum Sangeun","year":"2019","unstructured":"Sangeun Kum and Juhan Nam. 2019. Joint Detection and Classification of Singing Voice Melody Using Convolutional Recurrent Neural Networks. Applied Sciences (2019), 1324."},{"key":"e_1_3_2_1_30_1","volume-title":"Forecast of Digital Music revenue by segment in the United States from 2017 to","author":"Lindlahr Sebastian","year":"2025","unstructured":"Sebastian Lindlahr. 2021. Forecast of Digital Music revenue by segment in the United States from 2017 to 2025. In Statista. https:\/\/www.statista.com\/forecasts\/460034\/digital-music-revenue-in-the-united-states-forecast"},{"key":"e_1_3_2_1_31_1","volume-title":"CWS-PResUNet: Music source separation with channel-wise subband phase-aware resunet. arXiv preprint arXiv:2112.04685","author":"Liu Haohe","year":"2021","unstructured":"Haohe Liu, Qiuqiang Kong, and Jiafeng Liu. 2021. CWS-PResUNet: Music source separation with channel-wise subband phase-aware resunet. arXiv preprint arXiv:2112.04685 (2021)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671473"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3271145"},{"key":"e_1_3_2_1_34_1","volume-title":"Improved speech enhancement with the wave-u-net. arXiv preprint arXiv:1811.11307","author":"Macartney Craig","year":"2018","unstructured":"Craig Macartney and Tillman Weyde. 2018. Improved speech enhancement with the wave-u-net. arXiv preprint arXiv:1811.11307 (2018)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6853678"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2019.8937135"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053424"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746277"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.1117372"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. 452--461","author":"Rendle Steffen","year":"2009","unstructured":"Steffen Rendle, Christoph Freudenthaler, Zeno Gantner, and Lars Schmidt-Thieme. 2009. BPR: Bayesian Personalized Ranking from Implicit Feedback. In Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. 452--461."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096956"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3091817"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645633"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414050"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the International Society for Music Information Retrieval Conference (ISMIR).","author":"Stoller Daniel","year":"2018","unstructured":"Daniel Stoller, Sebastian Ewert, and Simon Dixon. 2018. Wave-u-net: A multi-scale neural network for end-to-end audio source separation. In Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414601"},{"key":"e_1_3_2_1_47_1","volume-title":"RMVPE: A Robust Model for Vocal Pitch Estimation in Polyphonic Music. In INTERSPEECH. 5421--5425.","author":"Wei Haojie","year":"2023","unstructured":"Haojie Wei, Xueke Cao, Tangpeng Dan, and Yueguo Chen. 2023. RMVPE: A Robust Model for Vocal Pitch Estimation in Polyphonic Music. In INTERSPEECH. 5421--5425."},{"key":"e_1_3_2_1_48_1","volume-title":"DJCM: A Deep Joint Cascade Model for Singing Voice Separation and Vocal Pitch Estimation. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP).","author":"Wei Haojie","year":"2024","unstructured":"Haojie Wei, Xueke Cao, Wenbo Xu, Tangpeng Dan, and Yueguo Chen. 2024. DJCM: A Deep Joint Cascade Model for Singing Voice Separation and Vocal Pitch Estimation. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/544"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9858935"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i1.27790"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680985","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680985","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:35Z","timestamp":1750295855000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680985"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":51,"alternative-id":["10.1145\/3664647.3680985","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680985","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}