{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,21]],"date-time":"2025-12-21T06:25:06Z","timestamp":1766298306641,"version":"3.44.0"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"31","license":[{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-20038-9","type":"journal-article","created":{"date-parts":[[2024,8,22]],"date-time":"2024-08-22T02:59:25Z","timestamp":1724295565000},"page":"37655-37669","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Music source separation via hybrid waveform and spectrogram based generative adversarial network"],"prefix":"10.1007","volume":"84","author":[{"given":"Qiuxia","family":"Wu","sequence":"first","affiliation":[]},{"given":"Haipeng","family":"Deng","sequence":"additional","affiliation":[]},{"given":"Kun","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Zhiyong","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,20]]},"reference":[{"key":"20038_CR1","unstructured":"Arjovsky M, Chintala S, Bottou L (2017) Wasserstein generative adversarial networks. In International conference on machine learning, pp 214\u2013223. PMLR"},{"key":"20038_CR2","doi-asserted-by":"crossref","unstructured":"Cherry EC (1953) Some experiments on the recognition of speech, with one and with two ears. J Acoust Soc Am 25(5):975\u2013979","DOI":"10.1121\/1.1907229"},{"key":"20038_CR3","unstructured":"D\u00e9fossez A (2021) Hybrid spectrogram and waveform source separation. In ISMIR Workshop on Music Source Separation"},{"key":"20038_CR4","unstructured":"D\u00e9fossez A, Usunier N, Bottou L, Bach F (2019) Music source separation in the waveform domain. arXiv:1911.13254"},{"key":"20038_CR5","doi-asserted-by":"crossref","unstructured":"Durrieu JL, David B, Richard G (2011) A musically motivated mid-level representation for pitch estimation and musical audio source separation. IEEE J Sel Top Signal Process 5(6):1180\u20131191","DOI":"10.1109\/JSTSP.2011.2158801"},{"key":"20038_CR6","unstructured":"Fabbro G, Uhlich S, Lai CH, Choi W, Ram\u00edrez MM, Liao W, Gadelha I, Ramos G, Hsu E, Rodrigues H etal (2021) The sound demixing challenge 2023\u2013music demixing track"},{"key":"20038_CR7","doi-asserted-by":"crossref","unstructured":"Fan ZH, Lai YL, Jang JSR (2018) Svsgan: Singing voice separation via generative adversarial network. In ICASSP pp 726\u2013730. IEEE","DOI":"10.1109\/ICASSP.2018.8462091"},{"key":"20038_CR8","doi-asserted-by":"crossref","unstructured":"Goodfellow I, Pouget-Abadie J, Mirza M, Bing X, Warde-Farley D, Ozair S, Courville A, Bengio Y (2020) Generative adversarial networks. Commun ACM 63(11):139\u2013144","DOI":"10.1145\/3422622"},{"key":"20038_CR9","unstructured":"Gulrajani I, Ahmed F, Arjovsky M, Dumoulin V, Courville AC (2017) Improved training of wasserstein gans. Adv Neural Inf Process Syst 30"},{"key":"20038_CR10","doi-asserted-by":"crossref","unstructured":"Hennequin R, Khlif A, Voituret F, Moussallam M (2020) Spleeter: a fast and efficient music source separation tool with pre-trained models. J Open Source Softw 5(50):2154","DOI":"10.21105\/joss.02154"},{"key":"20038_CR11","unstructured":"Heusel M, Ramsauer H, Unterthiner T, Nessler B, Hochreiter S (2017) Gans trained by a two time-scale update rule converge to a local nash equilibrium. Adv Neural Inf Process Syst 30"},{"key":"20038_CR12","doi-asserted-by":"crossref","unstructured":"Hsu CL, Wang D, Jang JSR, Hu K (2012) A tandem algorithm for singing pitch extraction and voice separation from music accompaniment. IEEE Transactions on audio, speech, and language processing, 20(5):1482\u20131491","DOI":"10.1109\/TASL.2011.2182510"},{"key":"20038_CR13","doi-asserted-by":"crossref","unstructured":"Huang PS, Kim M, Hasegawa-Johnson M, Smaragdis P (2015) Joint optimization of masks and deep recurrent neural networks for monaural source separation. IEEE\/ACM Transactions on Audio, Speech, and Language Processing 23(12):2136\u20132147","DOI":"10.1109\/TASLP.2015.2468583"},{"key":"20038_CR14","unstructured":"Hyvarinen A, Karhunen J, Oja E (2002) Independent component analysis. Studies in informatics and control 11(2):205\u2013207"},{"key":"20038_CR15","doi-asserted-by":"crossref","unstructured":"Kim J, Kang HG (2023) Contrastive learning based deep latent masking for music source separation. In Proceedings of INTERSPEECH pp 3709\u20133713","DOI":"10.21437\/Interspeech.2023-1723"},{"key":"20038_CR16","unstructured":"Kim M, Choi W, Chung J, Lee D, Jung S (2021) Kuielab-mdx-net: A two-stream neural network for music demixing. arXiv:2111.12203"},{"key":"20038_CR17","unstructured":"Kim M, Lee JH (2023) Sound demixing challenge 2023\u2013music demixing track technical report. arXiv:2306.09382"},{"key":"20038_CR18","unstructured":"Kingma DP, Ba J (2014) Adam: A method for stochastic optimization. arXiv:1412.6980"},{"key":"20038_CR19","unstructured":"Kong Q, Cao Y, Liu H, Choi K, Wang Y (2021) Decoupling magnitude and phase estimation with deep resunet for music source separation. arXiv:2109.05418"},{"key":"20038_CR20","doi-asserted-by":"crossref","unstructured":"Lee DD, Seung HS (1999) Learning the parts of objects by non-negative matrix factorization. Nat 401(6755):788\u2013791","DOI":"10.1038\/44565"},{"key":"20038_CR21","doi-asserted-by":"crossref","unstructured":"Luo Y, Mesgarani N (2019) Conv-tasnet: Surpassing ideal time-frequency magnitude masking for speech separation. IEEE\/ACM transactions on audio, speech, and language processing 27(8):1256\u20131266","DOI":"10.1109\/TASLP.2019.2915167"},{"key":"20038_CR22","doi-asserted-by":"crossref","unstructured":"Mao X, Li Q, Xie H, Lau RY, Wang Z, Paul\u00a0Smolley S (2017) Least squares generative adversarial networks. In Proceedings of the IEEE international conference on computer vision pp 2794\u20132802","DOI":"10.1109\/ICCV.2017.304"},{"key":"20038_CR23","unstructured":"Mirza M, Osindero S (2014) Conditional generative adversarial nets. arXiv:1411.1784"},{"key":"20038_CR24","doi-asserted-by":"crossref","unstructured":"Mitsufuji Y, Fabbro G, Uhlich S, St\u00f6ter FR (2021) Music demixing challenge 2021. arXiv:2108.13559","DOI":"10.3389\/frsip.2021.808395"},{"key":"20038_CR25","unstructured":"Miyato T, Kataoka T, Koyama M, Yoshida Y (2018) Spectral normalization for generative adversarial networks. arXiv:1802.05957"},{"key":"20038_CR26","unstructured":"Nowozin S, Cseke B, Tomioka R (2016) f-gan: Training generative neural samplers using variational divergence minimization. Advances in neural information processing systems 29"},{"key":"20038_CR27","unstructured":"Odena A, Olah C, Shlens J (2017) Conditional image synthesis with auxiliary classifier gans. In International conference on machine learning pp 2642\u20132651. PMLR"},{"key":"20038_CR28","unstructured":"Radford A, Metz L, Chintala S (2015) Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv:1511.06434"},{"key":"20038_CR29","unstructured":"Rafii Z, Liutkus A, St\u00f6ter FR, Mimilakis SI, Bittner R (2017) The MUSDB18 corpus for music separation, December 2017"},{"key":"20038_CR30","doi-asserted-by":"crossref","unstructured":"Rouard S, Massa F, D\u00e9fossez A (2023) Hybrid transformers for music source separation. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 1\u20135. IEEE","DOI":"10.1109\/ICASSP49357.2023.10096956"},{"key":"20038_CR31","doi-asserted-by":"crossref","unstructured":"Satya MF, Suyanto S (2020) Music source separation using generative adversarial network and u-net. In ICoICT, pp 1\u20136. IEEE","DOI":"10.1109\/ICoICT49345.2020.9166374"},{"key":"20038_CR32","doi-asserted-by":"crossref","unstructured":"Serra J, G\u00f3mez E, Herrera P (2010) Audio cover song identification and similarity: background, approaches, evaluation, and beyond. In Advances in music information retrieval pp 307\u2013332. Springer","DOI":"10.1007\/978-3-642-11674-2_14"},{"key":"20038_CR33","unstructured":"Stoller D, Ewert S, Dixon S (2018) Wave-u-net: A multi-scale neural network for end-to-end audio source separation. arXiv:1806.03185"},{"key":"20038_CR34","doi-asserted-by":"crossref","unstructured":"St\u00f6ter FR, Liutkus A, Ito N (2018) The 2018 signal separation evaluation campaign. In International Conference on Latent Variable Analysis and Signal Separation pp 293\u2013305. Springer","DOI":"10.1007\/978-3-319-93764-9_28"},{"key":"20038_CR35","doi-asserted-by":"crossref","unstructured":"St\u00f6ter FR, Uhlich S, Liutkus A, Mitsufuji Y (2019) Open-unmix-a reference implementation for music source separation. J Open Source Softw 4(41):1667","DOI":"10.21105\/joss.01667"},{"key":"20038_CR36","doi-asserted-by":"crossref","unstructured":"Takahashi N, Goswami N, Mitsufuji Y (2018) Mmdenselstm: An efficient combination of convolutional and recurrent neural networks for audio source separation. In IWAENC pp 106\u2013110. IEEE","DOI":"10.1109\/IWAENC.2018.8521383"},{"key":"20038_CR37","unstructured":"Takahashi N, Mitsufuji Y (2020) D3net: Densely connected multidilated densenet for music source separation. arXiv:2010.01733"},{"key":"20038_CR38","doi-asserted-by":"crossref","unstructured":"Uhlich S, Giron F, Mitsufuji Y (2015) Deep neural network based instrument extraction from music. In ICASSP pp 2135\u20132139. IEEE","DOI":"10.1109\/ICASSP.2015.7178348"},{"key":"20038_CR39","unstructured":"Vembu S, Baumann S (2005) Separation of vocals from polyphonic audio recordings. In ISMIR pp 337\u2013344. Citeseer"},{"key":"20038_CR40","doi-asserted-by":"crossref","unstructured":"Virtanen T (2007) Monaural sound source separation by nonnegative matrix factorization with temporal continuity and sparseness criteria. IEEE Trans Audio Speech Lang Process 15(3):1066\u20131074","DOI":"10.1109\/TASL.2006.885253"},{"key":"20038_CR41","doi-asserted-by":"crossref","unstructured":"Zhang H, Xu T, Li H, Zhang S, Wang X, Huang X, Metaxas DN (2018) Stackgan++: Realistic image synthesis with stacked generative adversarial networks. IEEE transactions on pattern analysis and machine intelligence 41(8):1947\u20131962","DOI":"10.1109\/TPAMI.2018.2856256"},{"key":"20038_CR42","doi-asserted-by":"crossref","unstructured":"Zhang H, Xiao N, Liu P, Wang Z, Tang R (2020) G-rnn-gan for singing voice separation. In Proceedings of the 2020 5th International Conference on Multimedia Systems and Signal Processing pp 69\u201373","DOI":"10.1145\/3404716.3404718"},{"key":"20038_CR43","doi-asserted-by":"crossref","unstructured":"Wen Z, Yonghui Z, Yanjun S, Jie S (2021) Stereo feature enhancement and temporal information extraction network for automatic music transcription. IEEE Signal Process Lett 28:1500\u20131504","DOI":"10.1109\/LSP.2021.3099073"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20038-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-20038-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20038-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T09:44:28Z","timestamp":1758102268000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-20038-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,20]]},"references-count":43,"journal-issue":{"issue":"31","published-online":{"date-parts":[[2025,9]]}},"alternative-id":["20038"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-20038-9","relation":{},"ISSN":["1573-7721"],"issn-type":[{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2024,8,20]]},"assertion":[{"value":"7 October 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 July 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 August 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 August 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Haipeng Deng, Kun Hu, Qiuxia Wu and Zhiyong Wang declare that no conflict of interest could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}