{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,13]],"date-time":"2026-06-13T18:46:14Z","timestamp":1781376374961,"version":"3.54.1"},"reference-count":72,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2018,12,13]],"date-time":"2018-12-13T00:00:00Z","timestamp":1544659200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001459","name":"Ministry of Education - Singapore","doi-asserted-by":"publisher","award":["MOE Academic fund AFD 05\/15 SL"],"award-info":[{"award-number":["MOE Academic fund AFD 05\/15 SL"]}],"id":[{"id":"10.13039\/501100001459","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100007040","name":"Singapore University of Technology and Design","doi-asserted-by":"publisher","award":["SUTD SRG ISTD 2017 129"],"award-info":[{"award-number":["SUTD SRG ISTD 2017 129"]}],"id":[{"id":"10.13039\/501100007040","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2020,2]]},"DOI":"10.1007\/s00521-018-3933-z","type":"journal-article","created":{"date-parts":[[2018,12,13]],"date-time":"2018-12-13T05:41:55Z","timestamp":1544679715000},"page":"1037-1050","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":31,"title":["Singing voice separation using a deep convolutional neural network trained by ideal binary mask and cross entropy"],"prefix":"10.1007","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6749-6889","authenticated-orcid":false,"given":"Kin Wah Edward","family":"Lin","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"B. T.","family":"Balamurali","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Enyan","family":"Koh","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Simon","family":"Lui","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dorien","family":"Herremans","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2018,12,13]]},"reference":[{"key":"3933_CR1","unstructured":"Abadi M, Agarwal A, Barham P, Brevdo E, Chen Z, Citro C, Corrado GS, Davis A, Dean J, Devin M, Ghemawat S, Goodfellow I, Harp A, Irving G, Isard M, Jia Y, Jozefowicz R, Kaiser L, Kudlur M, Levenberg J, Man\u00e9 D, Monga R, Moore S, Murray D, Olah C, Schuster M, Shlens J, Steiner B, Sutskever I, Talwar K, Tucker P, Vanhoucke V, Vasudevan V, Vi\u00e9gas F, Vinyals O, Warden P, Wattenberg M, Wicke M, Yu Y, Zheng X (2015) TensorFlow: Large-scale machine learning on heterogeneous systems https:\/\/www.tensorflow.org\/ , software available from https:\/\/www.tensorflow.org"},{"key":"3933_CR2","unstructured":"Bittner RM, Salamon J, Tierney M, Mauch M, Cannam C, Bello JP (2014) Medleydb: a multitrack dataset for annotation-intensive mir research. In: International society for music information retrieval conference (ISMIR). pp 155\u2013160"},{"key":"3933_CR3","volume-title":"Auditory scene analysis: the perceptual organization of sound","author":"AS Bregman","year":"1994","unstructured":"Bregman AS (1994) Auditory scene analysis: the perceptual organization of sound. MIT Press, Cambridge"},{"key":"3933_CR4","unstructured":"Casey M, Westner A (2000) Separation of mixed audio sources by independent subspace analysis. In: International computer music conference (ICMC)"},{"key":"3933_CR5","doi-asserted-by":"crossref","unstructured":"Chan T, Yeh T, Fan Z, Chen H, Su L, Yang Y, Jang R (2015) Vocal activity informed singing voice separation with the ikala dataset. In: IEEE international conference on acoustics, speech and signal processing (ICASSP). pp 718\u2013722","DOI":"10.1109\/ICASSP.2015.7178063"},{"key":"3933_CR6","doi-asserted-by":"crossref","unstructured":"Chandna P, Miron M, Janer J, G\u00f3mez E (2017) Monoaural audio source separation using deep convolutional neural networks. In: International conference on latent variable analysis and signal separation (LVA\/ICA),","DOI":"10.1007\/978-3-319-53547-0_25"},{"issue":"5","key":"3933_CR7","doi-asserted-by":"publisher","first-page":"975","DOI":"10.1121\/1.1907229","volume":"25","author":"EC Cherry","year":"1953","unstructured":"Cherry EC (1953) Some experiments on the recognition of speech, with one and with two ears. J Acoust Soc Am 25(5):975\u2013979","journal-title":"J Acoust Soc Am"},{"key":"3933_CR8","doi-asserted-by":"crossref","unstructured":"Chuan CH, Herremans D (2018) Modeling temporal tonal relations in polyphonic music through deep networks with a novel image-based representation. In: AAAI conference on artificial intelligence (AAAI)","DOI":"10.1609\/aaai.v32i1.11880"},{"key":"3933_CR9","unstructured":"Dessein A, Cont A, Lemaitre G (2010) Real-time polyphonic music transcription with non-negative matrix factorization and beta-divergence. In: International society for music information retrieval conference (ISMIR). pp 489\u2013494"},{"issue":"6","key":"3933_CR10","doi-asserted-by":"publisher","first-page":"1180","DOI":"10.1109\/JSTSP.2011.2158801","volume":"5","author":"JL Durrieu","year":"2011","unstructured":"Durrieu JL, David B, Richard G (2011) A musically motivated mid-level representation for pitch estimation and musical audio source separation. IEEE J Sel Top Signal Process 5(6):1180\u20131191","journal-title":"IEEE J Sel Top Signal Process"},{"key":"3933_CR11","unstructured":"Eggert J, Korner E (2004) Sparse coding and NMF. IEEE international joint conference on neural networks. vol 4, pp 2529\u20132533"},{"key":"3933_CR12","doi-asserted-by":"crossref","unstructured":"Fan ZC, Jang JSR, Lu CL (2016) Singing voice separation and pitch extraction from monaural polyphonic audio music via DNN and adaptive pitch tracking. In: IEEE international conference on multimedia big data (BigMM)","DOI":"10.1109\/BigMM.2016.56"},{"key":"3933_CR13","unstructured":"Fan ZC, Lai YL, Jang JSR (2017) Svsgan: singing voice separation via generative adversarial network. In: arXiv:1710.11428"},{"issue":"3","key":"3933_CR14","doi-asserted-by":"publisher","first-page":"793","DOI":"10.1162\/neco.2008.04-08-771","volume":"21","author":"C F\u00e9votte","year":"2009","unstructured":"F\u00e9votte C, Bertin N, Durrieu JL (2009) Nonnegative matrix factorization with the itakura-saito divergence: with application to music analysis. Neural Comput 21(3):793\u2013830","journal-title":"Neural Comput"},{"issue":"1","key":"3933_CR15","first-page":"62","volume":"4","author":"D FitzGerald","year":"2010","unstructured":"FitzGerald D, Gainza M (2010) Single channel vocal separation using median filtering and factorisation techniques. ISAST Trans Electr Signal Process 4(1):62\u201373","journal-title":"ISAST Trans Electr Signal Process"},{"issue":"3","key":"3933_CR16","doi-asserted-by":"publisher","first-page":"638","DOI":"10.1109\/TASL.2010.2041386","volume":"18","author":"H Fujihara","year":"2010","unstructured":"Fujihara H, Goto M, Kitahara T, Okuno HG (2010) A modeling of singing voice robust to accompaniment sounds and its application to singer identification and vocal-timbre-similarity-based music information retrieval. IEEE Trans Audio Speech Lang Process 18(3):638\u2013648","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"3933_CR17","unstructured":"Glorot X, Bengio Y (2010) Understanding the difficulty of training deep feedforward neural networks. In: International conference on artificial intelligence and statistics"},{"key":"3933_CR18","unstructured":"Grais EM, Roma G, Simpson AJR, Plumbley MD (2016) Single-channel audio source separation using deep neural network ensembles. In: Audio engineering society convention 140"},{"issue":"5","key":"3933_CR19","doi-asserted-by":"publisher","first-page":"69:1","DOI":"10.1145\/3108242","volume":"50","author":"D Herremans","year":"2017","unstructured":"Herremans D, Chuan CH, Chew E (2017) A functional taxonomy of music generation systems. ACM Comput Surv 50(5):69:1\u201369:30","journal-title":"ACM Comput Surv"},{"issue":"2","key":"3933_CR20","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1016\/0893-6080(91)90009-T","volume":"4","author":"K Hornik","year":"1991","unstructured":"Hornik K (1991) Approximation capabilities of multilayer feedforward networks. Neural Netw 4(2):251\u2013257","journal-title":"Neural Netw"},{"key":"3933_CR21","doi-asserted-by":"crossref","unstructured":"Huang G, Liu Z, van der Maaten L, Weinberger KQ (2017) Densely connected convolutional networks. In: The IEEE conference on computer vision and pattern recognition (CVPR)","DOI":"10.1109\/CVPR.2017.243"},{"key":"3933_CR22","unstructured":"Huang PS, Kim M, Hasegawa-Johnson M, Smaragdis P (2014) Singing-voice separation from monaural recordings using deep recurrent neural networks. In: International society for music information retrieval conference (ISMIR). pp 477\u2013482"},{"issue":"12","key":"3933_CR23","doi-asserted-by":"publisher","first-page":"2136","DOI":"10.1109\/TASLP.2015.2468583","volume":"23","author":"PS Huang","year":"2015","unstructured":"Huang PS, Kim M, Hasegawa-Johnson M, Smaragdis P (2015) Joint optimization of masks and deep recurrent neural networks for monaural source separation. IEEE\/ACM Trans Audio Speech Lang Process 23(12):2136\u20132147","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"3933_CR24","doi-asserted-by":"crossref","unstructured":"Huang P, Chen S, Smaragdis P, Hasegawa-Johnson M (Mar 2012) Singing-voice separation from monaural recordings using robust principal component analysis. In: IEEE international conference on acoustics, speech and signal processing (ICASSP). pp 57\u201360","DOI":"10.1109\/ICASSP.2012.6287816"},{"key":"3933_CR25","unstructured":"Humphrey E, Montecchio N, Bittner R, Jansson A, Jehan T (2017) Mining labeled data from web-scale collections for vocal activity detection in music. In: Proceedings of the 18th ISMIR conference"},{"issue":"11","key":"3933_CR26","doi-asserted-by":"publisher","first-page":"2084","DOI":"10.1109\/TASLP.2016.2577879","volume":"24","author":"Y Ikemiya","year":"2016","unstructured":"Ikemiya Y, Itoyama K, Yoshii K (2016) Singing voice separation and vocal F0 estimation based on mutual combination of robust principal component analysis and subharmonic summation. IEEE\/ACM Trans Audio Speech Lang Process 24(11):2084\u20132095","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"3933_CR27","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: accelerating deep network training by reducing internal covariate shift. In: International conference on machine learning (ICML). pp 448\u2013456"},{"key":"3933_CR28","unstructured":"Jansson A, Humphrey E, Montecchio N, Bittner R, Kumar A, Weyde T (2017) Singing voice separation with deep u-net convolutional networks. In: International society for music information retrieval conference (ISMIR). pp 745\u2013751"},{"issue":"10","key":"3933_CR29","doi-asserted-by":"publisher","first-page":"1197","DOI":"10.1109\/LSP.2014.2329946","volume":"21","author":"IY Jeong","year":"2014","unstructured":"Jeong IY, Lee K (2014) Vocal separation from monaural music using temporal\/spectral continuity and sparsity constraints. IEEE Signal Process Lett 21(10):1197\u20131200","journal-title":"IEEE Signal Process Lett"},{"key":"3933_CR30","doi-asserted-by":"publisher","first-page":"553","DOI":"10.1007\/978-3-319-53547-0_52","volume-title":"Latent Variable Analysis and Signal Separation","author":"Il-Young Jeong","year":"2017","unstructured":"Jeong IY, Lee K (2017) Singing voice separation using rpca with weighted l1-norm. In: International conference on latent variable analysis and signal separation (LVA\/ICA). Springer, Berlin, pp 553\u2013562"},{"key":"3933_CR31","unstructured":"Kingma D, Ba J (2014) Adam: a method for stochastic optimization. arXiv:1412.6980"},{"key":"3933_CR32","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. In: Advances in neural information processing systems. pp 1097\u20131105"},{"key":"3933_CR33","unstructured":"Lee DD, Seung HS (2001) Algorithms for non-negative matrix factorization. In: Advances in neural information processing systems. pp 556\u2013562"},{"key":"3933_CR34","doi-asserted-by":"crossref","unstructured":"Lin KWE, Anderson H, Agus N, So C, Lui S (2014a) Visualising singing style under common musical events using pitch-dynamics trajectories and modified traclus clustering. In: International conference on machine learning and applications (ICMLA). pp 237\u2013242","DOI":"10.1109\/ICMLA.2014.44"},{"key":"3933_CR35","unstructured":"Lin KWE, Anderson H, Hamzeen M, Lui S (2014b) Implementation and evaluation of real-time interactive user interface design in self-learning singing pitch training apps. In: Joint proceedings of international computer music conference (ICMC) and sound and music computing conference (SMC)"},{"key":"3933_CR36","doi-asserted-by":"crossref","unstructured":"Lin KWE, Anderson H, So C, Lui S (2017) Sinusoidal partials tracking for singing analysis using the heuristic of the minimal frequency and magnitude difference. In: Interspeech. pp 3038\u20133042","DOI":"10.21437\/Interspeech.2017-17"},{"key":"3933_CR37","doi-asserted-by":"crossref","unstructured":"Lin KWE, Feng T, Agus N, So C, Lui S (2014c) Modelling mutual information between voiceprint and optimal number of mel-frequency cepstral coefficients in voice discrimination. In: International conference on machine learning and applications (ICMLA). pp 15\u201320","DOI":"10.1109\/ICMLA.2014.9"},{"key":"3933_CR38","unstructured":"Lin Z, Chen M, Ma Y (2010) The augmented lagrange multiplier method for exact recovery of corrupted low-rank matrices. Tech. rep., UILU-ENG-09-2214, UIUC"},{"key":"3933_CR39","doi-asserted-by":"crossref","unstructured":"Liutkus A, Fitzgerald D, Rafii Z (2015) Scalable audio separation with light kernel additive modelling. In: IEEE International conference on acoustics, speech and signal processing (ICASSP). pp 76\u201380","DOI":"10.1109\/ICASSP.2015.7177935"},{"key":"3933_CR40","doi-asserted-by":"crossref","unstructured":"Liutkus A, Rafii Z, Badeau R, Pardo B, Richard G (2012) Adaptive filtering for music\/voice separation exploiting the repeating musical structure. In: IEEE international conference on acoustics, speech and signal processing (ICASSP). pp 53\u201356","DOI":"10.1109\/ICASSP.2012.6287815"},{"key":"3933_CR41","doi-asserted-by":"publisher","first-page":"323","DOI":"10.1007\/978-3-319-53547-0_31","volume-title":"Latent Variable Analysis and Signal Separation","author":"Antoine Liutkus","year":"2017","unstructured":"Liutkus A, St\u00f6ter FR, Rafii Z, Kitamura D, Rivet B, Ito N, Ono N, Fontecave J (2017) The 2016 signal separation evaluation campaign. In: International conference on latent variable analysis and signal separation (LVA\/ICA). Springer, Berlin, pp 323\u2013332"},{"key":"3933_CR42","doi-asserted-by":"crossref","unstructured":"Long J, Shelhamer E, Darrell T (2015) Fully convolutional networks for semantic segmentation. In: IEEE Conference on computer vision and pattern recognition (CVPR)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"3933_CR43","unstructured":"Loughran R, Walker J, O\u2019Neill M, O\u2019Farrell M (2008) The use of mel-frequency cepstral coefficients in musical instrument identification. In: International computer music conference (ICMC)"},{"key":"3933_CR44","doi-asserted-by":"crossref","unstructured":"Luo Y, Chen Z, Hershey JR, Roux JL, Mesgarani N (2017) Deep clustering and conventional networks for music separation: Stronger together. In: IEEE international conference on acoustics, speech and signal processing (ICASSP). pp 61\u201365","DOI":"10.1109\/ICASSP.2017.7952118"},{"key":"3933_CR45","unstructured":"Mauch M, Fujihara H, Yoshii K, Goto M (2011) Timbre and melody features for the recognition of vocal activity and instrumental solos in polyphonic music. In: International society for music information retrieval conference (ISMIR). pp 233\u2013238"},{"key":"3933_CR46","doi-asserted-by":"publisher","first-page":"546047","DOI":"10.1186\/1687-4722-2010-546047","volume":"1","author":"A Mesaros","year":"2010","unstructured":"Mesaros A, Virtanen T (2010) Automatic recognition of lyrics in singing. EURASIP J Audio Speech Music Process 1:546047","journal-title":"EURASIP J Audio Speech Music Process"},{"key":"3933_CR47","volume-title":"Neural networks and deep learning","author":"MA Nielsen","year":"2015","unstructured":"Nielsen MA (2015) Neural networks and deep learning. Determination Press, New York"},{"issue":"9","key":"3933_CR48","doi-asserted-by":"publisher","first-page":"1652","DOI":"10.1109\/TASLP.2016.2580946","volume":"24","author":"AA Nugraha","year":"2016","unstructured":"Nugraha AA, Liutkus A, Vincent E (2016) Multichannel audio source separation with deep neural networks. IEEE\/ACM Trans Audio Speech Lang Process 24(9):1652\u20131664","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"3933_CR49","unstructured":"Oh SJ, Benenson R, Khoreva A, Akata Z, Fritz M, Schiele B (2017) Exploiting saliency for object segmentation from image level labels. In: IEEE conference on computer vision and pattern recognition (CVPR). pp 4410\u20134419"},{"key":"3933_CR50","unstructured":"den Oord AV, Dieleman S, Schrauwen B (2013) Deep content-based music recommendation. In: Advances in neural information processing systems. pp 2643\u20132651"},{"key":"3933_CR51","volume-title":"Discrete-time signal processing","author":"AV Oppenheim","year":"2009","unstructured":"Oppenheim AV, Schafer RW (2009) Discrete-time signal processing, 3rd edn. Prentice Hall Press, Upper Saddle River","edition":"3"},{"issue":"4","key":"3933_CR52","doi-asserted-by":"publisher","first-page":"1118","DOI":"10.1109\/TASL.2011.2172425","volume":"20","author":"A Ozerov","year":"2012","unstructured":"Ozerov A, Vincent E, Bimbot F (2012) A general flexible framework for the handling of prior information in audio source separation. IEEE Trans Audio Speech Lang Process 20(4):1118\u20131133","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"3933_CR53","unstructured":"Rafii Z, Pardo B (2012) Music\/voice separation using the similarity matrix. In: International society for music information retrieval conference (ISMIR). pp 583\u2013588"},{"issue":"1","key":"3933_CR54","doi-asserted-by":"publisher","first-page":"73","DOI":"10.1109\/TASL.2012.2213249","volume":"21","author":"Z Rafii","year":"2013","unstructured":"Rafii Z, Pardo B (2013) Repeating pattern extraction technique (repet): a simple method for music\/voice separation. IEEE Trans Audio Speech Lang Process 21(1):73\u201384","journal-title":"IEEE Trans Audio Speech Lang Process"},{"issue":"8","key":"3933_CR55","doi-asserted-by":"publisher","first-page":"1307","DOI":"10.1109\/TASLP.2018.2825440","volume":"26","author":"Z Rafii","year":"2018","unstructured":"Rafii Z, Liutkus A, Stoter FR, Mimilakis SI, FitzGerald D, Pardo B (2018) An overview of lead and accompaniment separation in music. IEEE\/ACM Trans Audio Speech Lang Process (TASLP) 26(8):1307\u20131335","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process (TASLP)"},{"key":"3933_CR56","unstructured":"Salamon J, Bittner R, Bonada J, Bosch JJ, G\u00f3mez E, Bello JP (2017) An analysis\/synthesis framework for automatic F0 annotation of multitrack datasets. In: International society for music information retrieval conference (ISMIR)"},{"key":"3933_CR57","unstructured":"Schl\u00fcter J (2016) Learning to pinpoint singing voice from weakly labeled examples. In: International society for music information retrieval conference (ISMIR). pp 44\u201350"},{"key":"3933_CR58","doi-asserted-by":"crossref","unstructured":"Simpson AJR, Roma G, Grais EM, Mason RD, Hummersone C, Liutkus A, Plumbley MD (2016) Evaluation of audio source separation models using hypothesis-driven non-parametric statistical methods. In: European signal processing conference (EUSIPCO). pp 1763\u20131767","DOI":"10.1109\/EUSIPCO.2016.7760551"},{"key":"3933_CR59","doi-asserted-by":"publisher","first-page":"429","DOI":"10.1007\/978-3-319-22482-4_50","volume-title":"Latent Variable Analysis and Signal Separation","author":"Andrew J. R. Simpson","year":"2015","unstructured":"Simpson AJ, Roma G, Plumbley MD (2015) Deep karaoke: extracting vocals from musical mixtures using a convolutional deep neural network. In: International conference on latent variable analysis and signal separation (LVA\/ICA). pp 429\u2013436"},{"issue":"1","key":"3933_CR60","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava N, Hinton GE, Krizhevsky A, Sutskever I, Salakhutdinov R (2014) Dropout: a simple way to prevent neural networks from overfitting. J Mach Learn Res 15(1):1929\u20131958","journal-title":"J Mach Learn Res"},{"key":"3933_CR61","unstructured":"Stoller D, Ewert S, Dixon S (2017) Adversarial semi-supervised audio source separation applied to singing voice extraction. arXiv:1711.00048"},{"key":"3933_CR62","doi-asserted-by":"publisher","first-page":"329","DOI":"10.1007\/978-3-319-93764-9_31","volume-title":"Latent Variable Analysis and Signal Separation","author":"Daniel Stoller","year":"2018","unstructured":"Stoller D, Ewert S, Dixon S (2018) Jointly detecting and separating singing voice: a multi-task approach. In: International conference on latent variable analysis and signal separation. Springer, Berlin, pp 329\u2013339"},{"key":"3933_CR63","unstructured":"Stter FR, Liutkus A, Badeau R, Edler B, Magron P (2016) Common fate model for unison source separation. In: IEEE International conference on acoustics, speech and signal processing (ICASSP). pp 126\u2013130"},{"key":"3933_CR64","unstructured":"Sturm BL, Morvidone M, Daudet L (2010) Musical instrument identification using multiscale mel-frequency cepstral coefficients. In: European signal processing conference. pp 477\u2013481"},{"key":"3933_CR65","doi-asserted-by":"crossref","unstructured":"Uhlich S, Giron F, Mitsufuji Y (2015) Deep neural network based instrument extraction from music. In: IEEE international conference on acoustics, speech and signal processing (ICASSP). pp 2135\u20132139","DOI":"10.1109\/ICASSP.2015.7178348"},{"key":"3933_CR66","doi-asserted-by":"crossref","unstructured":"Uhlich S, Porcu M, Giron F, Enenkl M, Kemp T, Takahashi N, Mitsufuji Y (2017) Improving music source separation based on deep neural networks through data augmentation and network blending. In: IEEE international conference on acoustics, speech and signal processing (ICASSP). pp 261\u2013265","DOI":"10.1109\/ICASSP.2017.7952158"},{"key":"3933_CR67","unstructured":"Vembu S, Baumann S (2005) Separation of vocals from polyphonic audio recordings. In: International society for music information retrieval conference (ISMIR). pp 337\u2013344"},{"issue":"4","key":"3933_CR68","doi-asserted-by":"publisher","first-page":"1462","DOI":"10.1109\/TSA.2005.858005","volume":"14","author":"E Vincent","year":"2006","unstructured":"Vincent E, Gribonval R, Fevotte C (2006) Performance measurement in blind audio source separation. IEEE Trans Audio Speech Lang Process 14(4):1462\u20131469","journal-title":"IEEE Trans Audio Speech Lang Process"},{"issue":"3","key":"3933_CR69","doi-asserted-by":"publisher","first-page":"1066","DOI":"10.1109\/TASL.2006.885253","volume":"15","author":"T Virtanen","year":"2007","unstructured":"Virtanen T (2007) Monaural sound source separation by nonnegative matrix factorization with temporal continuity and sparseness criteria. IEEE Trans Audio Speech Lang Process 15(3):1066\u20131074","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"3933_CR70","first-page":"181","volume-title":"On ideal binary mask as the computational goal of auditory scene analysis","author":"D Wang","year":"2005","unstructured":"Wang D (2005) On ideal binary mask as the computational goal of auditory scene analysis. Springer, New York, pp 181\u2013197"},{"key":"3933_CR71","doi-asserted-by":"crossref","unstructured":"Wang Y, Kan MY, Nwe TL, Shenoy A, Yin J (2004) Lyrically: automatic synchronization of acoustic musical signals and textual lyrics. In: ACM international conference on multimedia. ACM, Cambridge, pp 212\u2013219","DOI":"10.1145\/1027527.1027576"},{"issue":"12","key":"3933_CR72","doi-asserted-by":"publisher","first-page":"1849","DOI":"10.1109\/TASLP.2014.2352935","volume":"22","author":"Y Wang","year":"2014","unstructured":"Wang Y, Narayanan A, Wang D (2014) On training targets for supervised speech separation. IEEE\/ACM Trans Audio Speech Lang Process (TASLP) 22(12):1849\u20131858","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process (TASLP)"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-018-3933-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00521-018-3933-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-018-3933-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,8]],"date-time":"2022-09-08T04:02:18Z","timestamp":1662609738000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00521-018-3933-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,12,13]]},"references-count":72,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2020,2]]}},"alternative-id":["3933"],"URL":"https:\/\/doi.org\/10.1007\/s00521-018-3933-z","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,12,13]]},"assertion":[{"value":"16 December 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 December 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 December 2018","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with ethical standards"}},{"value":"The authors of this manuscript certify that they have NO affiliations with or involvement in any organization or entity with any financial interest (such as honoraria; educational grants; participation in speakers bureaus; membership, employment, consultancies, stock ownership, or other equity interest; and expert testimony or patent-licensing arrangements) or non-financial interest (such as personal or professional relationships, affiliations, knowledge or beliefs) in the subject matter or materials discussed in this manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}