{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T15:52:28Z","timestamp":1781193148540,"version":"3.54.1"},"reference-count":86,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001381","name":"National Research Foundation Singapore","doi-asserted-by":"publisher","award":["AISG-100E-2018-006"],"award-info":[{"award-number":["AISG-100E-2018-006"]}],"id":[{"id":"10.13039\/501100001381","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Advanced Manufacturing and Engineering, Research, Innovation and Enterprise 2020 Programmatic Fund","award":["A1687b0033"],"award-info":[{"award-number":["A1687b0033"]}]},{"name":"U Bremen Excellence Chairs Program, Germany"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2020]]},"DOI":"10.1109\/taslp.2020.2987429","type":"journal-article","created":{"date-parts":[[2020,4,14]],"date-time":"2020-04-14T21:23:25Z","timestamp":1586899405000},"page":"1370-1384","source":"Crossref","is-referenced-by-count":154,"title":["SpEx: Multi-Scale Time Domain Speaker Extraction Network"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1584-6282","authenticated-orcid":false,"given":"Chenglin","family":"Xu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7237-0874","authenticated-orcid":false,"given":"Wei","family":"Rao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6257-7399","authenticated-orcid":false,"given":"Eng Siong","family":"Chng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9158-9401","authenticated-orcid":false,"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref73","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015"},{"key":"ref72","article-title":"Csr-i (wsj0) complete ldc93s6a","author":"garofolo","year":"1993","journal-title":"Linguistic Data Consortium Philadelphia"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683855"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1038\/srep34390"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.858005"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1121\/1.1914702"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462116"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472683"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1150"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1093\/oxfordjournals.bmb.a070274"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462505"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952155"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462507"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1176"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462471"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2726762"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952154"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2795749"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2006.891312"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2821903"},{"key":"ref61","first-page":"1","article-title":"Learning the speech front-end with raw waveform cldnns","author":"sainath","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref63","first-page":"1305","article-title":"Learning multiscale features directly from waveforms","author":"zhu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2047419"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0205355"},{"key":"ref27","first-page":"97","article-title":"Super-human multi-talker speech recognition: The IBM 2006 Speech Separation Challenge System","author":"kristjansson","year":"0","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1037\/10037-000"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2008.09.001"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2015.2468583"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639038"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2352935"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178061"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.bandc.2016.09.006"},{"key":"ref1","doi-asserted-by":"crossref","first-page":"975","DOI":"10.1121\/1.1907229","article-title":"Some experiments on the recognition of speech, with one and with two ears","volume":"25","author":"colin","year":"1953","journal-title":"J Acoust Soc Amer"},{"key":"ref20","first-page":"1457","article-title":"Non-negative matrix factorization with sparseness constraints","volume":"5","author":"hoyer","year":"2004","journal-title":"J Mach Learn Res"},{"key":"ref22","first-page":"2614","article-title":"Single-channel speech separation using sparse non-negative matrix factorization","author":"schmidt","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2006.1661352"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.885253"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.876726"},{"key":"ref26","first-page":"89","article-title":"Speech recognition using factorial hidden Markov models for separation in the feature space","author":"virtanen","year":"0","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2007.366322"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1101"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9004016"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1629"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2010.2042530"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1545"},{"key":"ref56","first-page":"165","article-title":"Deep neural network-based speaker embeddings for end-to-end speaker verification","author":"snyder","year":"0","journal-title":"Proc IEEE Workshop Spoken Lang Technol"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682834"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2696307"},{"key":"ref52","first-page":"1","article-title":"Complex spectrogram enhancement by convolutional neural network with multi-metrics learning","author":"fu","year":"0","journal-title":"Proc IEEE 27th Int Workshop Mach Learn Signal Process"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-64680-0"},{"key":"ref11","first-page":"26","article-title":"A study of learning based beamforming methods for speech recognition","author":"xiao","year":"0","journal-title":"Proc CHiME Workshop"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2290"},{"key":"ref12","first-page":"1273","article-title":"Target speaker extraction for multi-talker speaker verification","author":"rao","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref13","first-page":"2808","article-title":"Diarization is hard: Some experiences and lessons learned for the JHU team in the inaugural Dihard Challenge","author":"sell","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1983.1171927"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1121\/1.400725"},{"key":"ref82","first-page":"437","article-title":"Cracking the cocktail party problem by multi-beam deep attractor network","author":"chen","year":"0","journal-title":"Proc IEEE Autom Speech Recognit Understanding Workshop"},{"key":"ref16","article-title":"Prediction-driven computational auditory scene analysis","author":"ellis","year":"1996"},{"key":"ref81","first-page":"35","article-title":"A review of the cocktail party effect","volume":"12","author":"arons","year":"1992","journal-title":"J American Voice I\/O Society"},{"key":"ref17","first-page":"1277","article-title":"A harmonic-model-based front end for robust speech recognition","author":"seltzer","year":"0","journal-title":"Proc 8th Eur Conf Speech Commun Technol"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054266"},{"key":"ref18","author":"wang","year":"2006","journal-title":"Computational Auditory Scene Analysis Principles Algorithms and Applications"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461639"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.881700"},{"key":"ref80","article-title":"The cocktail party effect in auditory interfaces: A study of simultaneous presentation","volume":"2","author":"stifelman","year":"1994","journal-title":"Retrieved August"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1162\/0898929053467631"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.3758\/BF03196169"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1038\/nature11020"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1093\/cercor\/bhp124"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1093\/cercor\/bht355"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/MSPEC.2017.7864754"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1126\/sciadv.aav6134"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1098\/rstb.2016.0101"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683448"},{"key":"ref9","author":"li","year":"2015","journal-title":"Robust Automatic Speech Recognition A Bridge to Practical Applications"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683874"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1205"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683087"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682245"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2915167"},{"key":"ref41","article-title":"Tasnet: Surpassing ideal time-frequency masking for speech separation","author":"luo","year":"2018"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462661"},{"key":"ref43","first-page":"8","article-title":"Learning speaker representation for neural network based multichannel speaker extraction","author":"\u017emol\u00edkov\u00e1","year":"0","journal-title":"Proc IEEE Autom Speech Recognit Understanding Workshop"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/8938144\/09067003.pdf?arnumber=9067003","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,27]],"date-time":"2022-01-27T20:22:20Z","timestamp":1643314940000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9067003\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"references-count":86,"URL":"https:\/\/doi.org\/10.1109\/taslp.2020.2987429","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]}}}