{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:58:10Z","timestamp":1776884290945,"version":"3.51.2"},"reference-count":33,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,4,23]],"date-time":"2024-04-23T00:00:00Z","timestamp":1713830400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,4,23]],"date-time":"2024-04-23T00:00:00Z","timestamp":1713830400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"The National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62071302"],"award-info":[{"award-number":["62071302"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2024,6]]},"DOI":"10.1007\/s10772-024-10101-z","type":"journal-article","created":{"date-parts":[[2024,4,23]],"date-time":"2024-04-23T19:06:01Z","timestamp":1713899161000},"page":"299-306","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Improving low-complexity and real-time DeepFilterNet2 for personalized speech enhancement"],"prefix":"10.1007","volume":"27","author":[{"given":"Shilin","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haixin","family":"Guan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuang","family":"Wei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0924-408X","authenticated-orcid":false,"given":"Yanhua","family":"Long","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,4,23]]},"reference":[{"key":"10101_CR1","doi-asserted-by":"crossref","unstructured":"Desplanques, B., Jenthe, T., & Kris, D. (2020). EACAP-TDNN: Emphasized channel attention, propagation and aggregation in TDNN based speaker verification. In Interspeech-Proceedings, (pp. 3830\u20133834).","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"10101_CR2","unstructured":"Dubey, H., Aazami, A., Gopal, V., Naderi, B., Braun, S., Cutler, R., Ju, A., Zohourian, M., Tang, M., Gamper, H., Golestaneh, M., & Aichner, R. (2023). ICASSP 2023 deep speech enhancement challenge, in arXiv preprint arXiv:2303.11510."},{"key":"10101_CR3","doi-asserted-by":"crossref","unstructured":"Dubey, H., Gopal, V., Cutler, R., Aazami, A., Matusevych, S., Braun, S., & Eskimez, S. E., Thakker, M., Yoshioka, T., Gamper, H., & Aichner, R. (2022). ICASSP 2022 deep noise suppression challenge. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 9271\u20139275).","DOI":"10.1109\/ICASSP43922.2022.9747230"},{"key":"10101_CR4","doi-asserted-by":"crossref","unstructured":"Eskimez, S. E., Yoshioka, T., Wang, H., Wang, X., Chen, Z., & Huang, X. (2022). Personalized speech enhancement: New models and comprehensive evaluation. In IEEE International Conference on Acoustics, Speech and Signal Processing ICA SSP, (pp. 356\u2013360).","DOI":"10.1109\/ICASSP43922.2022.9746962"},{"key":"10101_CR5","unstructured":"Ge, X., Han, J., Guan, H., & Long, Y. (2022). Dynamic acoustic compensation and adaptive focal training for personalized speech enhancement, in arXiv preprint arXiv:2211.12097."},{"key":"10101_CR6","doi-asserted-by":"crossref","unstructured":"Gemmeke, J. F., Ellis, D. P., Freedman, D., Jansen, A., Lawrence, W., Moore, R. C., Plakal, M., & Ritter, M. (2017). Audio set: An ontology and human-labeled dataset for audio events. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 776\u2013780).","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"10101_CR7","doi-asserted-by":"crossref","unstructured":"Giri, R., Isik, U., & Krishnaswamy, A. (2019). Attention wave-u-net for speech enhancement. In Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), (pp. 249\u2013253).","DOI":"10.1109\/WASPAA.2019.8937186"},{"key":"10101_CR8","doi-asserted-by":"crossref","unstructured":"Han, J., Long, Y., Burget, L., & \u010cernock\u1ef3, J. (2022). DPCCN: Densely-connected pyramid complex convolutional network for robust speech separation and extraction. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 7292\u20137296).","DOI":"10.1109\/ICASSP43922.2022.9747340"},{"key":"10101_CR9","doi-asserted-by":"crossref","unstructured":"He, S., Li, H., & Zhang, X (2020). Speakerfilter: Deep learning-based target speaker extraction using anchor speech. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 376\u2013380).","DOI":"10.1109\/ICASSP40776.2020.9054222"},{"key":"10101_CR10","doi-asserted-by":"crossref","unstructured":"He, S., Li, H., & Zhang, X (2022). Speakerfilter-Pro: An improved target speaker extractor combines the time domain and frequency domain. In Processing ISCSLP, (pp. 473\u2013477).","DOI":"10.1109\/ISCSLP57327.2022.10037794"},{"key":"10101_CR11","unstructured":"Hsu, Y., Lee, Y., & Bai, M.R. (2022). Multi-channel target speech enhancement based on ERB-scaled spatial coherence features. In International Congress on Acoustics (ICA)."},{"key":"10101_CR12","doi-asserted-by":"crossref","unstructured":"Ju, Y., Rao, W., Yan, X., Fu, Y., Lv, S., Cheng, L., Wang, Y., Xie, L., & Shang, S. (2022). TEA-PSE: Tencent-ethereal-audio-lab personalized speech enhancement system for ICASSP 2022 DNS challenge. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 9291\u20139295).","DOI":"10.1109\/ICASSP43922.2022.9747765"},{"key":"10101_CR13","doi-asserted-by":"crossref","unstructured":"Ju, Y., Zhang, S., Rao, W., Wang, Y., Yu, T., Xie, L., & Shang, S. (2023). TEA-PSE 2.0: Sub-band network for real-time personalized speech enhancement. In IEEE Spoken Language Technology Workshop (SLT), (pp. 472\u2013479).","DOI":"10.1109\/SLT54892.2023.10023174"},{"key":"10101_CR14","doi-asserted-by":"crossref","unstructured":"Kim, J., El-Khamy, M., & Lee, J. (2020). T-GSA: Transformer with Gaussian-weighted self-attention for speech enhancement. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 6649\u20136653).","DOI":"10.1109\/ICASSP40776.2020.9053591"},{"key":"10101_CR15","doi-asserted-by":"crossref","unstructured":"Koizumi, Y., Yatabe, K., & Delcroix, M., et. al. (2020). Speech enhancement using self-adaptation and multi-head self-attention. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 181\u2013185).","DOI":"10.1109\/ICASSP40776.2020.9053214"},{"key":"10101_CR16","doi-asserted-by":"crossref","unstructured":"Le Roux, J., Wisdom, S., Erdogan, H., & Hershey, J.R. (2019). SDR\u2013half-baked or well done? In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 626\u2013630).","DOI":"10.1109\/ICASSP.2019.8683855"},{"key":"10101_CR17","unstructured":"Loshchilov, I., & Hutter, F. (2019). Decoupled weight decay regularization. In The Seventh International Conference on Learning Representations (ICLR)."},{"key":"10101_CR18","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1109\/LSP.2019.2955818","volume":"27","author":"W Mack","year":"2019","unstructured":"Mack, W., & Habets, E. A. (2019). Deep filtering: Signal extraction and reconstruction using complex time-frequency filters. IEEE Signal Processing Letters, 27, 61\u201365.","journal-title":"IEEE Signal Processing Letters"},{"key":"10101_CR19","doi-asserted-by":"crossref","unstructured":"Naderi, B., & Cutler, R. (2021). Subjective evaluation of noise suppression algorithms in crowdsourcing. In Interspeech-Proceedings, (pp. 2132\u20132136).","DOI":"10.21437\/Interspeech.2021-343"},{"key":"10101_CR20","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1016\/j.specom.2020.10.004","volume":"125","author":"A Nicolson","year":"2020","unstructured":"Nicolson, A., & Paliwal, K. (2020). Masked multi-head self-attention for causal speech enhancement. Speech Communication, 125, 80\u201396.","journal-title":"Speech Communication"},{"key":"10101_CR21","doi-asserted-by":"publisher","first-page":"1270","DOI":"10.1109\/TASLP.2021.3064421","volume":"29","author":"A Pandey","year":"2021","unstructured":"Pandey, A., & Wang, D. (2021). Dense CNN with self-attention for time-domain speech enhancement. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 29, 1270\u20131279.","journal-title":"IEEE\/ACM transactions on Audio, Speech, and Language Processing"},{"key":"10101_CR22","doi-asserted-by":"crossref","unstructured":"Reddy, C. K., Gopal, V., et al. (2022). DNSMOS P. 835: A non-intrusive perceptual objective speech quality metric to evaluate noise suppressors, In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 886\u2013890).","DOI":"10.1109\/ICASSP43922.2022.9746108"},{"key":"10101_CR23","doi-asserted-by":"crossref","unstructured":"Schroter, H., Escalante-B, A. N., Rosenkranz, T., & Maier, A. (2022). DeepFilterNet: A low complexity speech enhancement framework for full-band audio based on deep filtering. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 7407\u20137411).","DOI":"10.1109\/ICASSP43922.2022.9747055"},{"key":"10101_CR24","doi-asserted-by":"crossref","unstructured":"Schr\u00f6ter, H., Maier, A., Escalante-B, A., & Rosenkranz, T. (2022). DeepFilterNet2: Towards real-time speech enhancement on embedded devices for full-band audio. In International Workshop on Acoustic Signal Enhancement (IWAENC), (pp. 1\u20135).","DOI":"10.1109\/IWAENC53105.2022.9914782"},{"key":"10101_CR25","unstructured":"Schr\u00f6ter, H., Rosenkranz, T., & Maier, A., et. al. (2023). DeepFilterNet: Perceptually motivated real-time speech enhancement. In Interspeech-Proceedings, (pp 2008\u20132009)."},{"key":"10101_CR26","doi-asserted-by":"publisher","first-page":"2125","DOI":"10.1109\/TASL.2011.2114881","volume":"19","author":"CH Taal","year":"2011","unstructured":"Taal, C. H., Hendriks, R. C., Heusdens, R., & Jensen, J. (2011). An algorithm for intelligibility prediction of time-frequency weighted noisy speech. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 19, 2125\u20132136.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10101_CR27","doi-asserted-by":"crossref","unstructured":"Thiemann, J., Ito, N., & Vincent, E. (2013) The diverse environments multi-channel acoustic noise database (demand): A database of multichannel environmental noise recordings. In Proceedings of Meetings on Acoustics, (vol. 19).","DOI":"10.1121\/1.4799597"},{"key":"10101_CR28","unstructured":"Union, I. (2007). Wideband extension to recommendation p. 862 for the assessment of wideband telephone networks and speech codecs. In International Telecommunication Union, Recommendation P, (vol. 25)."},{"key":"10101_CR30","doi-asserted-by":"crossref","unstructured":"Wang, H., & Wang, D. (2022). Cross-domain speech enhancement with a neural cascade architecture. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (pp. 7862\u20137866).","DOI":"10.1109\/ICASSP43922.2022.9747752"},{"key":"10101_CR29","doi-asserted-by":"crossref","unstructured":"Wang, Q., Muckenhirn, H., Wilson, K., Sridhar, P., Wu, Z., Hershey, J., Saurous, R. A., Weiss, R. J., Jia, Y., Moreno, I.L. (2019). Voicefilter: Targeted voice separation by speaker-conditioned spectrogram masking. In  Interspeech - Proceedings, (pp. 2728\u20132732).","DOI":"10.21437\/Interspeech.2019-1101"},{"key":"10101_CR31","doi-asserted-by":"crossref","unstructured":"Zhao, S., Ma, B., Watcharasupat, K. N., & Gan, W. (2022). FRCRN: Boosting feature representation using frequency recurrence for monaural speech enhancement. In IEEE International Conference on Acoustics, Speech and Signal Processing ICA SSP)=, (pp. 9281\u20139285).","DOI":"10.1109\/ICASSP43922.2022.9747578"},{"key":"10101_CR32","doi-asserted-by":"publisher","first-page":"1598","DOI":"10.1109\/TASLP.2020.2995273","volume":"28","author":"Y Zhao","year":"2020","unstructured":"Zhao, Y., Wang, D., Xu, B., & Zhang, T. (2020). Monaural speech dereverberation using temporal convolutional networks with self attention. IEEE\/ACM transactions on Audio, Speech, and Language Processing, 28, 1598\u20131607.","journal-title":"IEEE\/ACM transactions on Audio, Speech, and Language Processing"},{"key":"10101_CR33","doi-asserted-by":"publisher","first-page":"800","DOI":"10.1109\/JSTSP.2019.2922820","volume":"13","author":"K \u017dmol\u00edkov\u00e1","year":"2019","unstructured":"\u017dmol\u00edkov\u00e1, K., Delcroix, M., Kinoshita, K., Ochiai, T., Nakatani, T., Burget, L., & \u010cernock\u1ef3, J. (2019). Speakerbeam: Speaker aware neural network for target speaker extraction in speech mixtures. IEEE Journal of Selected Topics in Signal Processing, 13, 800\u2013814.","journal-title":"IEEE Journal of Selected Topics in Signal Processing"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-024-10101-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-024-10101-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-024-10101-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,22]],"date-time":"2024-07-22T16:05:40Z","timestamp":1721664340000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-024-10101-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,23]]},"references-count":33,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2024,6]]}},"alternative-id":["10101"],"URL":"https:\/\/doi.org\/10.1007\/s10772-024-10101-z","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"value":"1381-2416","type":"print"},{"value":"1572-8110","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,4,23]]},"assertion":[{"value":"16 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 March 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 April 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"the authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}