{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T17:57:29Z","timestamp":1769104649496,"version":"3.49.0"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T00:00:00Z","timestamp":1753228800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T00:00:00Z","timestamp":1753228800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62271344"],"award-info":[{"award-number":["62271344"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1007\/s11760-025-04427-7","type":"journal-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T12:10:35Z","timestamp":1753272635000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["A multi-task Generalized Speech Restoration model using Depthwise Separable Convolution Layers"],"prefix":"10.1007","volume":"19","author":[{"given":"Ikram","family":"Azaz","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yasir","family":"Iqbal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanzhang","family":"Geng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,7,23]]},"reference":[{"key":"4427_CR1","doi-asserted-by":"crossref","unstructured":"Wang, D., Chen, J.: Supervised speech separation based on deep learning: An overview, IEEE\/ACM transactions on audio, speech, and language processing, vol. 26, no. 10, pp. 1702\u20131726, (2018)","DOI":"10.1109\/TASLP.2018.2842159"},{"key":"4427_CR2","unstructured":"Byun, K., Filos, J., Visser, E., Moon, S.: VC-ENHANCE: Speech Restoration with Integrated Noise Suppression and Voice Conversion, arXiv preprint arXiv:2409.06126, (2024)"},{"key":"4427_CR3","doi-asserted-by":"crossref","unstructured":"Liu, H., et al.: Voicefixer: A unified framework for high-fidelity speech restoration, arXiv preprint arXiv:2204.05841, (2022)","DOI":"10.21437\/Interspeech.2022-11026"},{"issue":"11","key":"4427_CR4","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM. 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"4427_CR5","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4427_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.specom.2020.04.002","volume":"122","author":"A Azarang","year":"2020","unstructured":"Azarang, A., Kehtarnavaz, N.: A review of multi-objective deep learning speech denoising methods. Speech Commun. 122, 1\u201310 (2020)","journal-title":"Speech Commun."},{"issue":"12","key":"4427_CR7","doi-asserted-by":"publisher","first-page":"8675","DOI":"10.1007\/s11760-024-03500-x","volume":"18","author":"Y Iqbal","year":"2024","unstructured":"Iqbal, Y., et al.: Speech enhancement using deep complex convolutional neural network (DCCNN) model. Signal. Image Video Process. 18(12), 8675\u20138692 (2024)","journal-title":"Signal. Image Video Process."},{"key":"4427_CR8","doi-asserted-by":"crossref","unstructured":"Baby, D., Bourlard, H.: Speech dereverberation using variational autoencoders, in ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),: IEEE, pp. 5784\u20135788. (2021)","DOI":"10.1109\/ICASSP39728.2021.9414736"},{"key":"4427_CR9","doi-asserted-by":"crossref","unstructured":"Kim, S.-B., Lee, S.-H., Choi, H.-Y., Lee, S.-W.: Audio Super-Resolution with robust speech representation learning of masked autoencoder. IEEE\/ACM Trans. Audio Speech Lang. Process., (2024)","DOI":"10.1109\/TASLP.2023.3349053"},{"key":"4427_CR10","doi-asserted-by":"publisher","first-page":"3032","DOI":"10.1109\/TASLP.2022.3205759","volume":"30","author":"Z Pan","year":"2022","unstructured":"Pan, Z., Ge, M., Li, H.: Universal speaker extraction with visual cue. IEEE\/ACM Trans. Audio Speech Lang. Process. 30, 3032\u20133045 (2022)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"4427_CR11","doi-asserted-by":"crossref","unstructured":"Li, J.: Recent advances in end-to-end automatic speech recognition. APSIPA Trans. Signal. Inform. Process., 11, 1, (2022)","DOI":"10.1561\/116.00000050"},{"key":"4427_CR12","doi-asserted-by":"publisher","first-page":"131858","DOI":"10.1109\/ACCESS.2021.3112535","volume":"9","author":"S Alharbi","year":"2021","unstructured":"Alharbi, S., et al.: Automatic speech recognition: Systematic literature review. Ieee Access. 9, 131858\u2013131876 (2021)","journal-title":"Ieee Access."},{"issue":"2","key":"4427_CR13","doi-asserted-by":"publisher","first-page":"216","DOI":"10.1108\/WJE-06-2021-0324","volume":"19","author":"HB Vanjari","year":"2022","unstructured":"Vanjari, H.B., Kolte, M.T.: Machine learning improvements to compressive sensing for speech enhancement in hearing aid applications. World J. Eng. 19(2), 216\u2013223 (2022)","journal-title":"World J. Eng."},{"key":"4427_CR14","doi-asserted-by":"crossref","unstructured":"Hartvigsen, T., Gabriel, S., Palangi, H., Sap, M., Ray, D., Kamar, E.: Toxigen: A large-scale machine-generated dataset for adversarial and implicit hate speech detection, arXiv preprint arXiv:2203.09509, (2022)","DOI":"10.18653\/v1\/2022.acl-long.234"},{"key":"4427_CR15","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, Q., Liu, X.: MaskSR: Masked Language Model for Full-band Speech Restoration, arXiv preprint arXiv:2406.02092, (2024)","DOI":"10.21437\/Interspeech.2024-1584"},{"key":"4427_CR16","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Speechx: Neural codec Language model as a versatile speech transformer. IEEE\/ACM Trans. Audio Speech Lang. Process., (2024)","DOI":"10.1109\/TASLP.2024.3419418"},{"key":"4427_CR17","unstructured":"Liu, A.H., Le, M., Vyas, A., Shi, B., Tjandra, A., Hsu, W.-N.: Generative pre-training for speech with flow matching, arXiv preprint arXiv:2310.16338, (2023)"},{"key":"4427_CR18","doi-asserted-by":"crossref","unstructured":"Ku, P.-J., Liu, A.H., Korostik, R., Huang, S.-F., Fu, S.-W., Juki\u0107, A.: Generative speech foundation model pretraining for high-quality speech extraction and restoration, arXiv preprint arXiv:2409.16117, (2024)","DOI":"10.1109\/ICASSP49660.2025.10888830"},{"key":"4427_CR19","unstructured":"Yang, D., et al.: UniAudio: Towards Universal Audio Generation with Large Language Models, in Forty-first International Conference on Machine Learning"},{"key":"4427_CR20","unstructured":"Lipman, Y., Chen, R.T., Ben-Hamu, H., Nickel, M., Le, M.: Flow matching for generative modeling, arXiv preprint arXiv:2210.02747, (2022)"},{"key":"4427_CR21","doi-asserted-by":"crossref","unstructured":"Liu, H., Chen, K., Tian, Q., Wang, W., Plumbley, M.D.: AudioSR: Versatile audio super-resolution at scale, in ICASSP 2024\u20132024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),: IEEE, pp. 1076\u20131080. (2024)","DOI":"10.1109\/ICASSP48485.2024.10447246"},{"issue":"2","key":"4427_CR22","doi-asserted-by":"publisher","first-page":"831","DOI":"10.1007\/s11042-024-19076-0","volume":"84","author":"V Parisae","year":"2025","unstructured":"Parisae, V., Bhavanam, S.N.: Adaptive attention mechanism for single channel speech enhancement. Multimedia Tools Appl. 84(2), 831\u2013856 (2025)","journal-title":"Multimedia Tools Appl."},{"key":"4427_CR23","doi-asserted-by":"crossref","unstructured":"Parisae, V., Nagakishore Bhavanam, S.: Stacked u-net with time\u2013frequency attention and deep connection net for single channel speech enhancement. Int. J. Image Graphics, p. 2550067, (2024)","DOI":"10.1142\/S0219467825500676"},{"key":"4427_CR24","doi-asserted-by":"crossref","unstructured":"Sivarambabu, P., Agrawal, R., Tirumala, A., Subani, S.M., Parisae, V., Nukala, S.S.: Enhancing cloud security through AI-Driven intrusion detection utilizing deep learning methods and autoencoder technology. Generative Artif. Intelligence: Concepts Appl., pp. 249\u2013264, (2025)","DOI":"10.1002\/9781394209835.ch15"},{"key":"4427_CR25","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural. Inf. Process. Syst., 30, (2017)"},{"key":"4427_CR26","unstructured":"Gu, A., Dao, T.: Mamba: Linear-time sequence modeling with selective state spaces, arXiv preprint arXiv:2312.00752, (2023)"},{"key":"4427_CR27","doi-asserted-by":"crossref","unstructured":"Chao, R., et al.: An investigation of incorporating Mamba for speech enhancement, in. In: 2024 IEEE Spoken Language Technology Workshop (SLT), pp. 302\u2013308. IEEE (2024)","DOI":"10.1109\/SLT61566.2024.10832332"},{"key":"4427_CR28","doi-asserted-by":"publisher","unstructured":"Chollet, F.: Xception: Deep Learning with Depthwise Separable Convolutions, in 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 21\u201326 July 2017 2017, pp. 1800\u20131807. https:\/\/doi.org\/10.1109\/CVPR.2017.195","DOI":"10.1109\/CVPR.2017.195"},{"key":"4427_CR29","doi-asserted-by":"publisher","unstructured":"Shahamiri, S.R., Mandal, K., Sarkar, S.: Dysarthric speech recognition: An investigation on using depthwise separable convolutions And residual connections. Neural Comput. Appl., 2024\/12\/17 2024, https:\/\/doi.org\/10.1007\/s00521-024-10870-3","DOI":"10.1007\/s00521-024-10870-3"},{"issue":"1","key":"4427_CR30","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1109\/TPDS.2021.3084813","volume":"33","author":"G Lu","year":"2021","unstructured":"Lu, G., Zhang, W., Wang, Z.: Optimizing depthwise separable Convolution operations on Gpus. IEEE Trans. Parallel Distrib. Syst. 33(1), 70\u201387 (2021)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"4427_CR31","doi-asserted-by":"publisher","unstructured":"Jang, J.-G., Quan, C., Lee, H.D., Kang, U.: Falcon: Lightweight and accurate Convolution based on depthwise separable Convolution. Knowl. Inf. Syst., 65, 5, pp. 2225\u20132249, 2023\/05\/01 2023, https:\/\/doi.org\/10.1007\/s10115-022-01818-x","DOI":"10.1007\/s10115-022-01818-x"},{"key":"4427_CR32","unstructured":"Botinhao, C.V., Wang, X., Takaki, S., Yamagishi, J.: Investigating RNN-based speech enhancement methods for noise-robust text-to-speech, in 9th ISCA speech synthesis workshop, pp. 159\u2013165. (2016)"},{"key":"4427_CR33","doi-asserted-by":"crossref","unstructured":"Thiemann, J., Ito, N., Vincent, E.: The diverse environments multi-channel acoustic noise database (demand): A database of multichannel environmental noise recordings, in Proceedings of Meetings on Acoustics, vol. 19, no. 1: AIP Publishing. (2013)","DOI":"10.1121\/1.4799597"},{"key":"4427_CR34","doi-asserted-by":"crossref","unstructured":"Veaux, C., Yamagishi, J., King, S.: The voice bank corpus: Design, collection and data analysis of a large regional accent speech database, in international conference oriental COCOSDA held jointly with 2013 conference on Asian spoken language research and evaluation (O-COCOSDA\/CASLRE), 2013: IEEE, pp. 1\u20134. (2013)","DOI":"10.1109\/ICSDA.2013.6709856"},{"key":"4427_CR35","doi-asserted-by":"crossref","unstructured":"Kinoshita, K., et al.: A summary of the REVERB challenge: state-of-the-art and remaining challenges in reverberant speech processing research, EURASIP Journal on Advances in Signal Processing, vol. pp. 1\u201319, 2016. (2016)","DOI":"10.1186\/s13634-016-0306-6"},{"key":"4427_CR36","doi-asserted-by":"crossref","unstructured":"Robinson, T., Fransen, J., Pye, D., Foote, J., Renals, S.: WSJCAMO: a British English speech corpus for large vocabulary continuous speech recognition, in International Conference on Acoustics, Speech, and Signal Processing, 1995, vol. 1: IEEE, pp. 81\u201384. (1995)","DOI":"10.1109\/ICASSP.1995.479278"},{"key":"4427_CR37","doi-asserted-by":"crossref","unstructured":"Lincoln, M., McCowan, I., Vepa, J., Maganti, H.K.: The multi-channel wall street journal audio visual corpus (MC-WSJ-AV): Specification and initial experiments. In: IEEE Workshop on Automatic Speech Recognition and Understanding, 2005, pp. 357\u2013362. IEEE (2005)","DOI":"10.1109\/ASRU.2005.1566470"},{"key":"4427_CR38","unstructured":"Yamagishi, J., Veaux, C., MacDonald, K., Corpus, C.S.T.R.V.C.T.K.: English multi-speaker corpus for CSTR voice cloning toolkit (version 0.92), University of Edinburgh. The Centre for Speech Technology Research (CSTR), pp. 271\u2013350, (2019)"},{"key":"4427_CR39","doi-asserted-by":"crossref","unstructured":"Hu, Y., Loizou, P.C.: Evaluation of objective quality measures for speech enhancement, IEEE Transactions on audio, speech, and language processing, vol. 16, no. 1, pp. 229\u2013238, (2007)","DOI":"10.1109\/TASL.2007.911054"},{"key":"4427_CR40","doi-asserted-by":"crossref","unstructured":"Kim, E., Seo, H.: SE-Conformer: Time-Domain Speech Enhancement Using Conformer Interspeech, pp. 2736\u20132740. (2021)","DOI":"10.21437\/Interspeech.2021-2207"},{"key":"4427_CR41","doi-asserted-by":"crossref","unstructured":"Fu, S.-W., et al.: Metricgan+: An improved version of metricgan for speech enhancement, arXiv preprint arXiv:2104.03538, (2021)","DOI":"10.21437\/Interspeech.2021-599"},{"key":"4427_CR42","doi-asserted-by":"crossref","unstructured":"Yu, G., Li, A., Zheng, C., Guo, Y., Wang, Y., Wang, H.: Dual-branch attention-in-attention transformer for single-channel speech enhancement, in ICASSP\u20132022 IEEE international conference on acoustics, speech and signal processing (ICASSP), 2022: IEEE, pp. 7847\u20137851. (2022)","DOI":"10.1109\/ICASSP43922.2022.9746273"},{"key":"4427_CR43","doi-asserted-by":"crossref","unstructured":"Dang, F., Chen, H., Zhang, P.: DPT-FSNet: Dual-path transformer based full-band and sub-band fusion network for speech enhancement, in ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),: IEEE, pp. 6857\u20136861. (2022)","DOI":"10.1109\/ICASSP43922.2022.9746171"},{"key":"4427_CR44","doi-asserted-by":"crossref","unstructured":"Abdulatif, S., Cao, R., Yang, B.: Cmgan: Conformer-based metric-gan for monaural speech enhancement. IEEE\/ACM Trans. Audio Speech Lang. Process., (2024)","DOI":"10.1109\/TASLP.2024.3393718"},{"key":"4427_CR45","doi-asserted-by":"crossref","unstructured":"Tribolet, J., Noll, P., McDermott, B., Crochiere, R.: A study of complexity and quality of speech waveform coders, in ICASSP\u201978. IEEE International Conference on Acoustics, Speech, and Signal Processing, vol. 3: IEEE, pp. 586\u2013590. (1978)","DOI":"10.1109\/ICASSP.1978.1170567"},{"key":"4427_CR46","doi-asserted-by":"crossref","unstructured":"Hansen, J.H., Pellom, B.L.: An effective quality evaluation protocol for speech enhancement algorithms. In: ICSLP, vol. 7, pp. 2819\u20132822. Citeseer (1998)","DOI":"10.21437\/ICSLP.1998-350"},{"issue":"2","key":"4427_CR47","doi-asserted-by":"publisher","first-page":"242","DOI":"10.1109\/49.601","volume":"6","author":"N Kitawaki","year":"1988","unstructured":"Kitawaki, N., Nagabuchi, H., Itoh, K.: Objective quality evaluation for low-bit-rate speech coding systems. IEEE J. Sel. Areas Commun. 6(2), 242\u2013248 (1988)","journal-title":"IEEE J. Sel. Areas Commun."},{"issue":"7","key":"4427_CR48","doi-asserted-by":"publisher","first-page":"1766","DOI":"10.1109\/TASL.2010.2052247","volume":"18","author":"TH Falk","year":"2010","unstructured":"Falk, T.H., Zheng, C., Chan, W.-Y.: A non-intrusive quality and intelligibility measure of reverberant and dereverberated speech. IEEE Trans. Audio Speech Lang. Process. 18(7), 1766\u20131774 (2010)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"4427_CR49","unstructured":"X. Xiaoet al., The NTU-ADSC systems for reverberation challenge 2014, in Proc. REVERB challenge workshop, 2014: Spoken Language Systems MIT Computer Science and Artificial Intelligence\u2026p. o2"},{"key":"4427_CR50","doi-asserted-by":"crossref","unstructured":"Ernst, O., Chazan, S.E., Gannot, S., Goldberger, J.: Speech dereverberation using fully convolutional networks, in 26th European Signal Processing Conference (EUSIPCO), 2018: IEEE, pp. 390\u2013394. (2018)","DOI":"10.23919\/EUSIPCO.2018.8553141"},{"key":"4427_CR51","unstructured":"Ribas, D., Llombart, J., Miguel, A., Vicente, L.: Deep speech enhancement for reverberated and noisy signals using wide residual networks, arXiv preprint arXiv:.00660, 2019. (1901)"},{"key":"4427_CR52","unstructured":"Kuleshov, V., Enam, S.Z., Ermon, S.: Audio super-resolution using neural nets, in ICLR (Workshop Track), (2017)"},{"key":"4427_CR53","doi-asserted-by":"crossref","unstructured":"Rakotonirina, N.C.: Self-attention for audio super-resolution, in 2021 IEEE 31st International Workshop on Machine Learning for Signal Processing (MLSP),: IEEE, pp. 1\u20136. (2021)","DOI":"10.1109\/MLSP52302.2021.9596082"},{"key":"4427_CR54","doi-asserted-by":"crossref","unstructured":"Lim, T.Y., Yeh, R.A., Xu, Y., Do, M.N., Hasegawa-Johnson, M.: Time-frequency networks for audio super-resolution, in IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2018: IEEE, pp. 646\u2013650. (2018)","DOI":"10.1109\/ICASSP.2018.8462049"},{"key":"4427_CR55","doi-asserted-by":"crossref","unstructured":"Wang, H., Wang, D.: Towards robust speech super-resolution, IEEE\/ACM transactions on audio, speech, and language processing, vol. 29, pp. 2058\u20132066, (2021)","DOI":"10.1109\/TASLP.2021.3054302"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-04427-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-025-04427-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-04427-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,7]],"date-time":"2025-09-07T20:24:38Z","timestamp":1757276678000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-025-04427-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,23]]},"references-count":55,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2025,11]]}},"alternative-id":["4427"],"URL":"https:\/\/doi.org\/10.1007\/s11760-025-04427-7","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,23]]},"assertion":[{"value":"25 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 June 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 June 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 July 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}},{"value":"The authors declare no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"902"}}