{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:24:04Z","timestamp":1775229844794,"version":"3.50.1"},"reference-count":55,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,1,19]]},"DOI":"10.1109\/slt48900.2021.9383615","type":"proceedings-article","created":{"date-parts":[[2021,3,25]],"date-time":"2021-03-25T16:46:54Z","timestamp":1616690814000},"page":"785-792","source":"Crossref","is-referenced-by-count":63,"title":["ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration"],"prefix":"10.1109","author":[{"given":"Chenda","family":"Li","sequence":"first","affiliation":[]},{"given":"Jing","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Wangyou","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Aswin Shanmugam","family":"Subramanian","sequence":"additional","affiliation":[]},{"given":"Xuankai","family":"Chang","sequence":"additional","affiliation":[]},{"given":"Naoyuki","family":"Kamo","sequence":"additional","affiliation":[]},{"given":"Moto","family":"Hira","sequence":"additional","affiliation":[]},{"given":"Tomoki","family":"Hayashi","sequence":"additional","affiliation":[]},{"given":"Christoph","family":"Boeddeker","sequence":"additional","affiliation":[]},{"given":"Zhuo","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","first-page":"749","article-title":"Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of tele-phone networks and codecs","volume":"2","author":"rix","year":"2001","journal-title":"Proc IEEE ICASSP"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2114881"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639038"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2008.09.001"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2019.2911179"},{"key":"ref30","author":"van trees","year":"2004","journal-title":"Optimum Array Processing Part IV of Detection Estimation and Modulation Theory"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.858005"},{"key":"ref36","article-title":"Unsupervised sound separation using mixtures of mixtures","author":"wisdom","year":"2020","journal-title":"ICML 2020 Workshop on Self-supervision in Audio and Speech"},{"key":"ref35","first-page":"708","article-title":"Phase-sensitive and recognition-boosted speech separation using deep recurrent neural networks","author":"erdogan","year":"2015","journal-title":"Proc IEEE ICASSP"},{"key":"ref34","first-page":"1849","article-title":"On training targets for supervised speech separation","volume":"22","author":"wang","year":"2014","journal-title":"IEEE\/ACM Trans ASLP"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-11130-3_6"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2025790"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1121\/1.1907229"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1201\/b14529"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1186\/s13634-016-0306-6"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2915167"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462116"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952154"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054266"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-552"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471664"},{"key":"ref50","article-title":"NARA-WPE: A Python package for weighted prediction error dereverberation in Numpy and Ten-sorflow for online and offline processing","author":"drude","year":"2018","journal-title":"13 ITG Fachtagung Sprachkommunikation (ITG 2018)"},{"key":"ref51","article-title":"Audio augmentation for speech recognition","author":"ko","year":"2015","journal-title":"Proc ISCA Interspeech"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683087"},{"key":"ref54","article-title":"An investigation of end-to-end multichannel speech recognition for reverberant and mismatch conditions","author":"subramanian","year":"2019"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054029"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1121\/1.382599"},{"key":"ref10","article-title":"Onssen: an open-source speech separation and enhancement library","author":"ni","year":"2019"},{"key":"ref11","doi-asserted-by":"crossref","first-page":"1667","DOI":"10.21105\/joss.01667","article-title":"Open-Unmix &#x2013; a reference implementation for music source separation","volume":"4","author":"st\u00f6ter","year":"2019","journal-title":"Journal of Open Source Soft-ware"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1176"},{"key":"ref12","first-page":"2637","article-title":"Asteroid: The PyTorch-based audio source separation toolkit for researchers","author":"pariente","year":"2020","journal-title":"Proc ISCA Inter-speech"},{"key":"ref13","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"NeurIPS"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2019.8937250"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003986"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053692"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053426"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2519"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2016.11.005"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1002\/9781119279860"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/89.966083"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2726762"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2019.2918706"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4020-6479-1"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.902460"},{"key":"ref9","first-page":"297","article-title":"The northwestern university source separation library","author":"manilow","year":"2018","journal-title":"Proc International Society for Music Information Retrieval (ISMIR)"},{"key":"ref46","first-page":"696","article-title":"WHAMR!: Noisy and Reverberant Single-Channel Speech Separation","author":"maciejewski","year":"2019","journal-title":"Proc IEEE ICASSP"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2821"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref47","year":"1993","journal-title":"LDC Catalog CSR-I (WSJ0) Complete"},{"key":"ref42","first-page":"2629","article-title":"The DIRHA simulated corpus","author":"cristoforetti","year":"2014","journal-title":"the Ninth Inter-national Conference on Language Resources and Evaluation (LREC&#x2019;14)"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461639"},{"key":"ref44","article-title":"LibriMix: An open-source dataset for generalizable speech separation","author":"cosentino","year":"2020"},{"key":"ref43","article-title":"SMS-WSJ: Database, performance measures, and baseline recipe for multi-channel source separation and recognition","author":"drude","year":"2019"}],"event":{"name":"2021 IEEE Spoken Language Technology Workshop (SLT)","location":"Shenzhen, China","start":{"date-parts":[[2021,1,19]]},"end":{"date-parts":[[2021,1,22]]}},"container-title":["2021 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9383468\/9383452\/09383615.pdf?arnumber=9383615","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,5,3]],"date-time":"2021-05-03T17:40:18Z","timestamp":1620063618000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9383615\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,1,19]]},"references-count":55,"URL":"https:\/\/doi.org\/10.1109\/slt48900.2021.9383615","relation":{},"subject":[],"published":{"date-parts":[[2021,1,19]]}}}