{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T18:20:35Z","timestamp":1772302835711,"version":"3.50.1"},"reference-count":31,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2021,2,3]],"date-time":"2021-02-03T00:00:00Z","timestamp":1612310400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,2,3]],"date-time":"2021-02-03T00:00:00Z","timestamp":1612310400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"name":"Natural Science Fund Project of China","award":["No.61301295"],"award-info":[{"award-number":["No.61301295"]}]},{"name":"the Anhui Natural Science Fund Project","award":["No.1708085MF151"],"award-info":[{"award-number":["No.1708085MF151"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Cogn Comput"],"published-print":{"date-parts":[[2022,5]]},"DOI":"10.1007\/s12559-020-09817-2","type":"journal-article","created":{"date-parts":[[2021,2,3]],"date-time":"2021-02-03T06:05:53Z","timestamp":1612332353000},"page":"1152-1158","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":39,"title":["SETransformer: Speech Enhancement Transformer"],"prefix":"10.1007","volume":"14","author":[{"given":"Weiwei","family":"Yu","sequence":"first","affiliation":[]},{"given":"Jian","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"HuaBin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Liang","family":"Tao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,2,3]]},"reference":[{"key":"9817_CR1","doi-asserted-by":"crossref","unstructured":"Wang W, Xing C, Wang D, et al. A Robust Audio-Visual Speech Enhancement Model, in ICASSP 2020 \u2013\u00a045th IEEE International Conference on Acoustics, Speech and Signal Processing, May 4\u20138. Virtual Barcelona. 2020.:p. 7529\u201333.","DOI":"10.1109\/ICASSP40776.2020.9053033"},{"key":"9817_CR2","doi-asserted-by":"crossref","unstructured":"Li L, Wang D, Zheng F. Neural Discriminant Analysis for Deep Speaker Embedding. in arXiv preprint arXiv:2005.11905, 2020.","DOI":"10.21437\/Interspeech.2020-2542"},{"key":"9817_CR3","doi-asserted-by":"crossref","unstructured":"Gogate M, Dashtipour K, Bell P, et al. Deep Neural Network Driven Binaural Audio Visual Speech Separation, in 2020 International Joint Conference on Neural Networks (IJCNN), 2020.\u00a0 p. 1\u20137.","DOI":"10.1109\/IJCNN48605.2020.9207517"},{"key":"9817_CR4","unstructured":"Gogate M, Adeel A, Dashtipour K, et al. AV Speech Enhancement Challenge using a Real Noisy Corpus. in arXiv preprint 2019. arXiv:1910.00424"},{"key":"9817_CR5","doi-asserted-by":"crossref","unstructured":"Gogate M, Dashtipour K, Adeel A, et al. Cochleanet: A robust language-independent audio-visual model for speech enhancement, in arXiv preprint 2019. arXiv:1909.10407","DOI":"10.1016\/j.inffus.2020.04.001"},{"issue":"1","key":"9817_CR6","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1109\/LSP.2013.2291240","volume":"21","author":"Y Xu","year":"2014","unstructured":"Xu Y, Du J, Dai L, Lee CH. An Experimental Study on Speech Enhancement Based on Deep Neural Networks. IEEE Signal Processing Letters. 2014;21(1):65\u20138.","journal-title":"IEEE Signal Processing Letters."},{"key":"9817_CR7","doi-asserted-by":"crossref","unstructured":"Narayanan A, Wang D. Ideal ratio mask estimation using deep neural networks for robust speech recognition, in ICASSP 2013 \u2013 38th IEEE International Conference on Acoustics, Speech and Signal Processing, May 26\u201331, Vancouver, BC, Canada; 2013. p. 7092\u20136.","DOI":"10.1109\/ICASSP.2013.6639038"},{"key":"9817_CR8","doi-asserted-by":"crossref","unstructured":"Xu Y, Du J, Dai L, Lee CH. Multi-Objective Learning and Mask-Based Post-Processing for Deep Neural Network Based Speech Enhancement, in Interspeech 2015 \u2013 15th Annual Conference of the International Speech Communication Association, September 6\u201310, Gremany, Austria; 2015. p. 1058\u2013512.","DOI":"10.21437\/Interspeech.2015-358"},{"key":"9817_CR9","doi-asserted-by":"crossref","unstructured":"Xu Y, Du J, Dai L, Lee CH. A Regression Approach to Speech Enhancement Based on Deep Neural Networks, in IEEE Transactions on Acoustics, Speech and Signal Processing. 2015;23(1):7\u201319.","DOI":"10.1109\/TASLP.2014.2364452"},{"key":"9817_CR10","doi-asserted-by":"crossref","unstructured":"Kounovsky T, Malek J. Single channel speech enhancement using convolutional neural network, in ECMSM 2017 \u2013 15th IEEE International Workshop of Electronics, May 24\u201326, Donostia-San Sebastian, Spain; 2017. p. 1\u20135.","DOI":"10.1109\/ECMSM.2017.7945915"},{"key":"9817_CR11","doi-asserted-by":"crossref","unstructured":"Park SR, Lee J. A fully convolutional neural network for speech enhancement, in Interspeech 2017 \u2013 17th Annual Conference of the International Speech Communication Association, August 20\u201324, Stockholm, Sweden; 2017. p. 1993\u20137.","DOI":"10.21437\/Interspeech.2017-1465"},{"key":"9817_CR12","doi-asserted-by":"crossref","unstructured":"Fu S, Tsao Y, Lu X. Raw Waveform-based Speech Enhancement by Fully Con- volutional Networks, in APSIPA ASC 2017 \u2013 9th Asia-Pacific Signal and Information Processing Association Annual Summit and Conference, December 12\u201315, Kuala Lumpur, Malaysia; 2017. p. 6\u201312.","DOI":"10.1109\/APSIPA.2017.8281993"},{"key":"9817_CR13","doi-asserted-by":"crossref","unstructured":"Grais EM, Plumbley MD, Single Channel Audio Source Separation using Con- volutional Denoising Autoencoders, in GlobalSIP 2017 \u2013 5th IEEE Global Conference on Signal and Information Processing, November 14\u201316, Montreal, QC, Canada; 2017, p. 1265\u20139.","DOI":"10.1109\/GlobalSIP.2017.8309164"},{"key":"9817_CR14","doi-asserted-by":"crossref","unstructured":"Huang P, Kim M, Hasegawa-Johnson M, Smaragdis P. Joint Optimization of Masks and Deep Recurrent Neural Networks for Monaural Source Separation, in IEEE Transactions on Acoustics, Speech and Signal Processing. 2015;23(12):2136\u201347.","DOI":"10.1109\/TASLP.2015.2468583"},{"key":"9817_CR15","doi-asserted-by":"crossref","unstructured":"Sun L, Du J, Dai L, Lee C. Multiple-target deep learning for LSTM-RNN based speech enhancement, in HSCMA 2017 \u2013 15th Hands-free Speech Communications and Microphone Arrays, March 1\u20133, San Francisco, California; 2017. p. 136\u201340.","DOI":"10.1109\/HSCMA.2017.7895577"},{"key":"9817_CR16","doi-asserted-by":"crossref","unstructured":"Sainath TN, Vinyals O, Senior A, Sak H. Convolutional, Long Short-Term Memory, fully connected Deep Neural Networks, in ICASSP 2015 \u2013 40th IEEE International Conference on Acoustics, Speech and Signal Processing, April 19\u201324, Brisbane, QLD, Australia; 2015, p. 4580\u20134.","DOI":"10.1109\/ICASSP.2015.7178838"},{"key":"9817_CR17","doi-asserted-by":"crossref","unstructured":"Mimilakis SI, Drossos K, Santos JF, Schuller G, Virtanen T, Bengio Y. Monaural Singing Voice Separation with Skip-Filtering Connections and Recurrent Inference of Time-Frequency Mask, in ICASSP 2018 \u2013 43th IEEE International Conference on Acoustics, Speech and Signal Processing, April 15\u201320, Calgary, AB, Canada;\n2018. p. 721\u20135.","DOI":"10.1109\/ICASSP.2018.8461822"},{"key":"9817_CR18","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I. Attention is all you need. Adv Neural Inf Process Syst. 2017, p. 5998\u20136008."},{"key":"9817_CR19","unstructured":"Wang Z, Ma Y, Liu Z, Tang J. R-Transformer: Recurrent Neural Network Enhanced Transformer, in arXiv preprint, 2019. arXiv: 1907.05572"},{"key":"9817_CR20","unstructured":"Wisdom S, Powers T, Hershey J, Roux JL, Atlas L. Full-capacity unitary recurrent neural networks. Adv Neural Inf Process Syst. 2016;4880\u20138."},{"key":"9817_CR21","unstructured":"Ba, JL Kiros JR, Hinton GE. Jonas Gehring, Michael Auli, David Grangier, Denis Yarats, and Yann N Dauphin, in arXiv preprint, 2016. arXiv:1607.06450"},{"key":"9817_CR22","unstructured":"Gehring J, Auli M, Grangier D, Yarats D, Dauphin YN. Convolutional sequence to sequence learning, in Proceedings of the 34th International Conference on Machine Learning. 2017;70:1243\u201352."},{"key":"9817_CR23","doi-asserted-by":"crossref","unstructured":"Garofolo JS, Lamel LF, Fisher WM, Fiscus JG, Pallett DS. ARPA TIMIT acoustic-phonetic continuous speech corpus CD-ROM. NIST speech disc 1-1.1, in NASA STI\/Recon technical report n. 1993;93.","DOI":"10.6028\/NIST.IR.4930"},{"key":"9817_CR24","unstructured":"Snyder D, Chen G, Povey D. Musan: A music, speech, and noise corpus, in arXiv preprint, 2015. arXiv:1510.08484"},{"key":"9817_CR25","unstructured":"Kingma D, Ba J. Musan: A music, speech, and noise corpus, in arXiv preprint, 2014. arXiv:1412.6980"},{"key":"9817_CR26","unstructured":"Srivastava N, Hinton G, Krizhevsky A, Sutskever I, Salakhutdinov R, Dropout: a simple way to prevent neural networks from overfitting, J Mach Lear Res. 2014;1929\u201358."},{"key":"9817_CR27","unstructured":"Valentini C, Wang X, Takaki S, Yamagishi J. Investigating rnn-based speech enhancement methods for noise-robust text-to-speech, 9th ISCA Speech Synthesis Workshop. 2016; p. 146\u201352."},{"issue":"3\u20134","key":"9817_CR28","doi-asserted-by":"publisher","first-page":"383","DOI":"10.1007\/s12021-018-9377-x","volume":"16","author":"Y Xue","year":"2018","unstructured":"Xue Y, Xu T, Zhang H, Srivastava N, Hinton G, Krizhevsky A, Sutskever I, Salakhutdinov R. Segan: Adversarial network with multi-scale l 1 loss for medical image segmentation. Neuroinformatics. 2018;16(3\u20134):383\u201339.","journal-title":"Neuroinformatics."},{"key":"9817_CR29","doi-asserted-by":"crossref","unstructured":"Shah N, Patil A, Soni H. Time-frequency mask-based speech enhancement using convolutional generative adversarial network, Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). 2018; p. 1246\u201351.","DOI":"10.23919\/APSIPA.2018.8659692"},{"key":"9817_CR30","unstructured":"Jansson A, Humphrey E, Montecchio N, et al. Singing voice separation with deep u-net convolutional networks, n Proceedings of the International Society for Music Information Retrieval Conference (ISMIR). 2017; p. 323\u201332."},{"key":"9817_CR31","doi-asserted-by":"crossref","unstructured":"Luo Y, Mesgarani N. Tasnet: time-domain audio separation network for real-time, single-channel speech separation, ICASSP 2018 \u2013 43th IEEE International Conference on Acoustics, Speech and Signal Processing, Apr 22\u201327, Seoul, South Korea. 2018; p. 696\u2013700.","DOI":"10.1109\/ICASSP.2018.8462116"}],"container-title":["Cognitive Computation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12559-020-09817-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s12559-020-09817-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12559-020-09817-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,17]],"date-time":"2022-05-17T11:40:18Z","timestamp":1652787618000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s12559-020-09817-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,2,3]]},"references-count":31,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2022,5]]}},"alternative-id":["9817"],"URL":"https:\/\/doi.org\/10.1007\/s12559-020-09817-2","relation":{},"ISSN":["1866-9956","1866-9964"],"issn-type":[{"value":"1866-9956","type":"print"},{"value":"1866-9964","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,2,3]]},"assertion":[{"value":"29 September 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 December 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 February 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of Interest"}},{"value":"This article does not contain any studies with human participants or animals performed by any of the authors.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}}]}}