{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T12:30:07Z","timestamp":1762432207828,"version":"3.37.3"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"13","license":[{"start":{"date-parts":[[2022,3,9]],"date-time":"2022-03-09T00:00:00Z","timestamp":1646784000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,3,9]],"date-time":"2022-03-09T00:00:00Z","timestamp":1646784000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2022,5]]},"DOI":"10.1007\/s11042-022-12632-6","type":"journal-article","created":{"date-parts":[[2022,3,9]],"date-time":"2022-03-09T16:20:44Z","timestamp":1646842844000},"page":"18617-18639","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Speech enhancement using U-nets with wide-context units"],"prefix":"10.1007","volume":"81","author":[{"given":"Tomasz","family":"Grzywalski","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4603-8894","authenticated-orcid":false,"given":"Szymon","family":"Drgas","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,3,9]]},"reference":[{"key":"12632_CR1","doi-asserted-by":"crossref","unstructured":"Ananthakrishnan KS, Dogancay K (2009) Recent trends and challenges in speech-separation systems research\u2014-A tutorial review. In: TENCON 2009-2009 IEEE region 10 conference, pp 1\u20136. IEEE","DOI":"10.1109\/TENCON.2009.5396022"},{"key":"12632_CR2","unstructured":"Bai S, Kolter JZ, Koltun V (2018) An empirical evaluation of generic convolutional and recurrent networks for sequence modeling. arXiv:1803.01271"},{"issue":"6","key":"12632_CR3","doi-asserted-by":"publisher","first-page":"4705","DOI":"10.1121\/1.4986931","volume":"141","author":"J Chen","year":"2017","unstructured":"Chen J, Wang D (2017) Long short-term memory for speaker generalization in supervised speech separation. J Acoust Soc Am 141(6):4705\u20134714","journal-title":"J Acoust Soc Am"},{"issue":"6","key":"12632_CR4","doi-asserted-by":"publisher","first-page":"1109","DOI":"10.1109\/TASSP.1984.1164453","volume":"32","author":"Y Ephraim","year":"1984","unstructured":"Ephraim Y, Malah D (1984) Speech enhancement using a minimum-mean square error short-time spectral amplitude estimator. IEEE Trans Acoust Speech Signal Process 32(6):1109\u20131121. https:\/\/doi.org\/10.1109\/TASSP.1984.1164453","journal-title":"IEEE Trans Acoust Speech Signal Process"},{"key":"12632_CR5","doi-asserted-by":"crossref","unstructured":"Erdogan H, Hershey JR, Watanabe S, Le Roux J (2015) Phase-sensitive and recognition-boosted speech separation using deep recurrent neural networks. In: 2015 IEEE International conference on acoustics, speech and signal processing (ICASSP), pp 708\u2013712. IEEE","DOI":"10.1109\/ICASSP.2015.7178061"},{"key":"12632_CR6","unstructured":"Freesound. https:\/\/freesound.org\/. Accessed: 2019-09-28"},{"key":"12632_CR7","volume-title":"Csr-i (wsj0) complete","author":"J Garofalo","year":"2007","unstructured":"Garofalo J, Graff D, Paul D, Pallett D (2007) Csr-i (wsj0) complete. Linguistic Data Consortium, Philadelphia"},{"key":"12632_CR8","unstructured":"Garofolo JS (1993) Timit acoustic phonetic continuous speech corpus. Linguistic Data Consortium, 1993"},{"key":"12632_CR9","doi-asserted-by":"crossref","unstructured":"Grais EM, Plumbley MD (2017) Single channel audio source separation using convolutional denoising autoencoders. arXiv:1703.08019","DOI":"10.1109\/GlobalSIP.2017.8309164"},{"key":"12632_CR10","doi-asserted-by":"crossref","unstructured":"Grzywalski T, Drgas S (2018) Application of recurrent u-net architecture to speech enhancement. In: Signal processing: algorithms, architectures, arrangements, and applications (SPA), pp 82\u201387. IEEE","DOI":"10.23919\/SPA.2018.8563364"},{"key":"12632_CR11","doi-asserted-by":"crossref","unstructured":"Grzywalski T, Drgas S (2019) Using recurrences in time and frequency within u-net architecture for speech enhancement. In: ICASSP 2019-2019 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 6970\u20136974. IEEE","DOI":"10.1109\/ICASSP.2019.8682830"},{"issue":"06","key":"12632_CR12","doi-asserted-by":"publisher","first-page":"2030001","DOI":"10.1142\/S0219691320300017","volume":"18","author":"RC Guido","year":"2020","unstructured":"Guido RC, Pedroso F, Furlan A, Contreras RC, Caobianco LG, Neto JS (2020) Cwt\u00d7 dwt\u00d7 dtwt\u00d7 sdtwt: clarifying terminologies and roles of different types of wavelet transforms. Int J Wavelets Multiresol Inform Proces 18(06):2030001","journal-title":"Int J Wavelets Multiresol Inform Proces"},{"key":"12632_CR13","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Identity mappings in deep residual networks. arXiv:1603.05027","DOI":"10.1007\/978-3-319-46493-0_38"},{"key":"12632_CR14","doi-asserted-by":"crossref","unstructured":"Huang PS, Kim M, Hasegawa-Johnson M, Smaragdis P (2014) Deep learning for monaural speech separation. In: 2014 IEEE International conference on acoustics, speech and signal processing (ICASSP), pp 1562\u20131566. IEEE","DOI":"10.1109\/ICASSP.2014.6853860"},{"issue":"12","key":"12632_CR15","doi-asserted-by":"publisher","first-page":"2136","DOI":"10.1109\/TASLP.2015.2468583","volume":"23","author":"PS Huang","year":"2015","unstructured":"Huang PS, Kim M, Hasegawa-Johnson M, Smaragdis P (2015) Joint optimization of masks and deep recurrent neural networks for monaural source separation. IEEE\/ACM Trans Audio Speech Lang Process 23(12):2136\u20132147","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"12632_CR16","doi-asserted-by":"publisher","unstructured":"Hui L, Cai M, Guo C, He L, Zhang WQ, Liu J (2015) Convolutional maxout neural networks for speech separation. In: 2015 IEEE international symposium on signal processing and information technology (ISSPIT), pp 24\u201327. https:\/\/doi.org\/10.1109\/ISSPIT.2015.7394335","DOI":"10.1109\/ISSPIT.2015.7394335"},{"key":"12632_CR17","unstructured":"Kingma DP, Ba J (2014) Adam: a method for stochastic optimization, arXiv:1412.6980"},{"key":"12632_CR18","doi-asserted-by":"crossref","unstructured":"Le Roux J, Wichern G, Watanabe S, Sarroff A, Hershey JR (2019) The phasebook: building complex masks via discrete representations for source separation. In: ICASSP 2019-2019 IEEE International conference on acoustics, speech and signal processing (ICASSP), pp 66\u201370. IEEE","DOI":"10.1109\/ICASSP.2019.8682587"},{"key":"12632_CR19","doi-asserted-by":"crossref","unstructured":"Le Roux J, Wisdom S, Erdogan H, Hershey JR (2019) Sdr\u2013half-baked or well done?. In: ICASSP 2019-2019 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 626\u2013630. IEEE","DOI":"10.1109\/ICASSP.2019.8683855"},{"key":"12632_CR20","doi-asserted-by":"crossref","unstructured":"Luo Y, Mesgarani N (2018) Tasnet: surpassing ideal time-frequency masking for speech separation, arXiv:1809.07454","DOI":"10.1109\/TASLP.2019.2915167"},{"key":"12632_CR21","doi-asserted-by":"crossref","unstructured":"Mesaros A, Heittola T, Virtanen T (2016) Tut database for acoustic scene classification and sound event detection. In: 2016 24th European signal processing conference (EUSIPCO), pp 1128\u20131132. IEEE","DOI":"10.1109\/EUSIPCO.2016.7760424"},{"key":"12632_CR22","doi-asserted-by":"crossref","unstructured":"Mowlaee P, Saeidi R, Christensen MG, Martin R (2012) Subjective and objective quality assessment of single-channel speech separation algorithms. In: 2012 IEEE International conference on acoustics, speech and signal processing (ICASSP), pp 69\u201372. IEEE","DOI":"10.1109\/ICASSP.2012.6287819"},{"key":"12632_CR23","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1016\/j.specom.2019.06.002","volume":"111","author":"A Nicolson","year":"2019","unstructured":"Nicolson A, Paliwal KK (2019) Deep learning for minimum mean-square error approaches to speech enhancement. Speech Commun 111:44\u201355. https:\/\/doi.org\/10.1016\/j.specom.2019.06.002, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167639318304308","journal-title":"Speech Commun"},{"key":"12632_CR24","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1016\/j.specom.2020.10.004","volume":"125","author":"A Nicolson","year":"2020","unstructured":"Nicolson A, Paliwal KK (2020) Masked multi-head self-attention for causal speech enhancement. Speech Comm 125:80\u201396","journal-title":"Speech Comm"},{"issue":"7","key":"12632_CR25","doi-asserted-by":"publisher","first-page":"1179","DOI":"10.1109\/TASLP.2019.2913512","volume":"27","author":"A Pandey","year":"2019","unstructured":"Pandey A, Wang D (2019) A new framework for cnn-based speech enhancement in the time domain. IEEE\/ACM Trans Audio Speech Lang Process 27 (7):1179\u20131188","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"12632_CR26","doi-asserted-by":"crossref","unstructured":"Pandey A, Wang D (2019) Tcnn: temporal convolutional neural network for real-time speech enhancement in the time domain. In: ICASSP 2019-2019 IEEE International conference on acoustics, speech and signal processing (ICASSP), pp 6875\u20136879. IEEE","DOI":"10.1109\/ICASSP.2019.8683634"},{"key":"12632_CR27","doi-asserted-by":"crossref","unstructured":"Park SR, Lee J (2016) A fully convolutional neural network for speech enhancement, arXiv:1609.07132","DOI":"10.21437\/Interspeech.2017-1465"},{"key":"12632_CR28","doi-asserted-by":"publisher","unstructured":"Pirhosseinloo S, Brumberg JS (2019) Monaural speech enhancement with dilated convolutions. In: Proc. Interspeech 2019, pp 3143\u20133147. https:\/\/doi.org\/10.21437\/Interspeech.2019-2782","DOI":"10.21437\/Interspeech.2019-2782"},{"key":"12632_CR29","doi-asserted-by":"crossref","unstructured":"Rix AW, Beerends JG, Hollier MP, Hekstra AP (2001) Perceptual evaluation of speech quality (pesq)-a new method for speech quality assessment of telephone networks and codecs. In: 2001 IEEE International conference on acoustics, speech, and signal processing. Proceedings (Cat. No. 01CH37221), vol 2, pp 749\u2013752. IEEE","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"12632_CR30","doi-asserted-by":"crossref","unstructured":"Ronneberger O, Fischer P, Brox T (2015) U-net: convolutional networks for biomedical image segmentation. In: International conference on medical image computing and computer-assisted intervention, pp 234\u2013241. Springer","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"12632_CR31","unstructured":"Stowell D, Plumbley MD (2013) An open dataset for research on audio field recording archives: freefield1010. arXiv:1309.5275"},{"issue":"2","key":"12632_CR32","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1109\/JSTSP.2019.2908760","volume":"13","author":"Y Sun","year":"2019","unstructured":"Sun Y, Xian Y, Wang W, Naqvi SM (2019) Monaural source separation in complex domain with long short-term memory neural network. IEEE J Selected Topics Signal Process 13(2):359\u2013369","journal-title":"IEEE J Selected Topics Signal Process"},{"issue":"7","key":"12632_CR33","doi-asserted-by":"publisher","first-page":"2125","DOI":"10.1109\/TASL.2011.2114881","volume":"19","author":"CH Taal","year":"2011","unstructured":"Taal CH, Hendriks RC, Heusdens R, Jensen J (2011) An algorithm for intelligibility prediction of time\u2013frequency weighted noisy speech. IEEE Trans Audio Speech Lang Process 19(7):2125\u20132136","journal-title":"IEEE Trans Audio Speech Lang Process"},{"issue":"1","key":"12632_CR34","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1109\/TASLP.2018.2876171","volume":"27","author":"K Tan","year":"2019","unstructured":"Tan K, Chen J, Wang D (2019) Gated residual networks with dilated convolutions for monaural speech enhancement. IEEE\/ACM Trans Audio Speech Lang Process 27(1):189\u2013198","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"12632_CR35","doi-asserted-by":"crossref","unstructured":"Tan K, Wang D (2018) A convolutional recurrent neural network for real-time speech enhancement. In: Interspeech, pp 3229\u20133233","DOI":"10.21437\/Interspeech.2018-1405"},{"issue":"3","key":"12632_CR36","doi-asserted-by":"publisher","first-page":"247","DOI":"10.1016\/0167-6393(93)90095-3","volume":"12","author":"A Varga","year":"1993","unstructured":"Varga A, Steeneken HJ (1993) Assessment for automatic speech recognition: Ii. noisex-92: A database and an experiment to study the effect of additive noise on speech recognition systems. Speech Commun 12(3):247\u2013251","journal-title":"Speech Commun"},{"issue":"3","key":"12632_CR37","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1109\/MSPEC.2017.7864754","volume":"54","author":"D Wang","year":"2017","unstructured":"Wang D (2017) Deep learning reinvents the hearing aid. IEEE Spectr 54(3):32\u201337","journal-title":"IEEE Spectr"},{"key":"12632_CR38","doi-asserted-by":"crossref","unstructured":"Wang D, Chen J (2018) Supervised speech separation based on deep learning: an overview. IEEE\/ACM Transactions on audio, speech, and language processing","DOI":"10.1109\/TASLP.2018.2842159"},{"issue":"12","key":"12632_CR39","doi-asserted-by":"publisher","first-page":"1849","DOI":"10.1109\/TASLP.2014.2352935","volume":"22","author":"Y Wang","year":"2014","unstructured":"Wang Y, Narayanan A, Wang D (2014) On training targets for supervised speech separation. IEEE\/ACM Trans Audio Speech Lang Process (TASLP) 22 (12):1849\u20131858","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process (TASLP)"},{"issue":"7","key":"12632_CR40","doi-asserted-by":"publisher","first-page":"1381","DOI":"10.1109\/TASL.2013.2250961","volume":"21","author":"Y Wang","year":"2013","unstructured":"Wang Y, Wang D (2013) Towards scaling up classification-based speech separation. IEEE Trans Audio Speech Lang Process 21(7):1381\u20131390","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"12632_CR41","doi-asserted-by":"crossref","unstructured":"Wang Y, Wang D (2014) A structure-preserving training target for supervised speech separation. In: 2014 IEEE International conference on acoustics, speech and signal processing (ICASSP), pp 6107\u20136111. IEEE","DOI":"10.1109\/ICASSP.2014.6854777"},{"key":"12632_CR42","doi-asserted-by":"crossref","unstructured":"Wang ZQ, Roux JL, Wang D, Hershey JR (2018) End-to-end speech separation with unfolded iterative phase reconstruction, arXiv:1804.10204","DOI":"10.21437\/Interspeech.2018-1629"},{"key":"12632_CR43","doi-asserted-by":"crossref","unstructured":"Wang ZQ, Tan K, Wang D (2019) Deep learning based phase reconstruction for speaker separation: a trigonometric perspective. In: ICASSP 2019-2019 IEEE International conference on acoustics, speech and signal processing (ICASSP), pp 71\u201375. IEEE","DOI":"10.1109\/ICASSP.2019.8683231"},{"issue":"7","key":"12632_CR44","doi-asserted-by":"publisher","first-page":"1492","DOI":"10.1109\/TASLP.2017.2696307","volume":"25","author":"DS Williamson","year":"2017","unstructured":"Williamson DS, Wang D (2017) Time-frequency masking in the complex domain for speech dereverberation and denoising. IEEE\/ACM Trans Audio Speech Lang Process 25(7):1492\u20131501","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"12632_CR45","unstructured":"Yu F, Koltun V (2015) Multi-scale context aggregation by dilated convolutions, arXiv:1511.07122"},{"key":"12632_CR46","doi-asserted-by":"publisher","first-page":"75","DOI":"10.1016\/j.specom.2020.09.002","volume":"124","author":"W Yuan","year":"2020","unstructured":"Yuan W (2020) A time\u2013frequency smoothing neural network for speech enhancement. Speech Commun 124:75\u201384. https:\/\/doi.org\/10.1016\/j.specom.2020.09.002, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167639320302703","journal-title":"Speech Commun"},{"key":"12632_CR47","unstructured":"Zhang R (2019) Making convolutional networks shift-invariant again, arXiv:1904.11486"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-12632-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-022-12632-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-12632-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,20]],"date-time":"2024-09-20T00:10:24Z","timestamp":1726791024000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-022-12632-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,9]]},"references-count":47,"journal-issue":{"issue":"13","published-print":{"date-parts":[[2022,5]]}},"alternative-id":["12632"],"URL":"https:\/\/doi.org\/10.1007\/s11042-022-12632-6","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2022,3,9]]},"assertion":[{"value":"1 December 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 March 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 February 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 March 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}