{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:23:28Z","timestamp":1759332208871,"version":"3.40.3"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031780134"},{"type":"electronic","value":"9783031780141"}],"license":[{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78014-1_18","type":"book-chapter","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T12:23:42Z","timestamp":1732191822000},"page":"238-249","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Exploring MetaConformer for\u00a0Speech Enhancement"],"prefix":"10.1007","author":[{"given":"Lukas","family":"F\u00f6rner","sequence":"first","affiliation":[]},{"given":"Maximilian","family":"Dauner","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"key":"18_CR1","unstructured":"Perceptual Evaluation of Speech Quality (PESQ): An Objective Method for End-to-End Speech Quality Assessment of Narrow-Band Telephone Networks and Speech Codecs. Technical report, P. 862, ITU-T, February 2001. https:\/\/www.itu.int\/rec\/T-REC-P.862\/en"},{"key":"18_CR2","unstructured":"Wideband Extension to Recommendation P.862 for the Assessment of Wideband Telephone Networks and Speech Codecs. Technical report, P.862.2, ITU-T, November 2007. https:\/\/www.itu.int\/rec\/T-REC-P.862.2"},{"key":"18_CR3","doi-asserted-by":"publisher","unstructured":"Cao, R., Abdulatif, S., Yang, B.: CMGAN: conformer-based metric GAN for speech enhancement. In: Interspeech 2022, pp. 936\u2013940, September 2022. https:\/\/doi.org\/10.21437\/Interspeech.2022-517. http:\/\/arxiv.org\/abs\/2203.15149","DOI":"10.21437\/Interspeech.2022-517"},{"key":"18_CR4","doi-asserted-by":"crossref","unstructured":"Chao, R., Yu, C., Fu, S.W., Lu, X., Tsao, Y.: Perceptual Contrast Stretching on Target Feature for Speech Enhancement, July 2022. http:\/\/arxiv.org\/abs\/2203.17152","DOI":"10.21437\/Interspeech.2022-10478"},{"key":"18_CR5","unstructured":"Chen, S., et al.: Continuous speech separation with conformer, October 2020. http:\/\/arxiv.org\/abs\/2008.05773"},{"key":"18_CR6","doi-asserted-by":"crossref","unstructured":"Dang, F., Chen, H., Zhang, P.: DPT-FSNet: dual-path transformer based full-band and sub-band fusion network for speech enhancement, January 2022. http:\/\/arxiv.org\/abs\/2104.13002","DOI":"10.1109\/ICASSP43922.2022.9746171"},{"key":"18_CR7","unstructured":"Fu, S.W., Liao, C.F., Tsao, Y., Lin, S.D.: MetricGAN: generative adversarial networks based black-box metric scores optimization for speech enhancement, May 2019. http:\/\/arxiv.org\/abs\/1905.04874"},{"key":"18_CR8","doi-asserted-by":"crossref","unstructured":"Fu, S.W., et al.: MetricGAN+: an improved version of MetricGAN for speech enhancement, June 2021. http:\/\/arxiv.org\/abs\/2104.03538","DOI":"10.21437\/Interspeech.2021-599"},{"key":"18_CR9","doi-asserted-by":"crossref","unstructured":"Gulati, A., et al.: Conformer: convolution-augmented transformer for speech recognition, May 2020. http:\/\/arxiv.org\/abs\/2005.08100","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"18_CR10","doi-asserted-by":"crossref","unstructured":"Hsieh, T.A., Yu, C., Fu, S.W., Lu, X., Tsao, Y.: Improving perceptual quality by phone-fortified perceptual loss using Wasserstein distance for speech enhancement, April 2021. http:\/\/arxiv.org\/abs\/2010.15174","DOI":"10.21437\/Interspeech.2021-582"},{"key":"18_CR11","doi-asserted-by":"publisher","unstructured":"Hu, Y., Loizou, P.C.: Evaluation of objective measures for speech enhancement. In: Interspeech 2006, pp. paper 2007\u2013Tue3FoP.10\u20130. ISCA, September 2006. https:\/\/doi.org\/10.21437\/Interspeech.2006-84. https:\/\/www.isca-speech.org\/archive\/interspeech_2006\/hu06_interspeech.html","DOI":"10.21437\/Interspeech.2006-84"},{"key":"18_CR12","doi-asserted-by":"publisher","unstructured":"Hu, Y., Loizou, P.C.: Evaluation of objective quality measures for speech enhancement. IEEE Trans. Audio Speech Lang. Process. 16(1), 229\u2013238 (2008). https:\/\/doi.org\/10.1109\/TASL.2007.911054. http:\/\/ieeexplore.ieee.org\/document\/4389058\/","DOI":"10.1109\/TASL.2007.911054"},{"key":"18_CR13","doi-asserted-by":"publisher","unstructured":"Kim, E., Seo, H.: SE-conformer: time-domain speech enhancement using conformer. In: Interspeech 2021, pp. 2736\u20132740. ISCA, August 2021. https:\/\/doi.org\/10.21437\/Interspeech.2021-2207. https:\/\/www.isca-archive.org\/interspeech_2021\/kim21h_interspeech.html","DOI":"10.21437\/Interspeech.2021-2207"},{"key":"18_CR14","doi-asserted-by":"publisher","unstructured":"Krishnamoorthy, P.: An overview of subjective and objective quality measures for noisy speech enhancement algorithms. IETE Tech. Rev. 28(4),\u00a0292 (2011). https:\/\/doi.org\/10.4103\/0256-4602.83550. http:\/\/tr.ietejournals.org\/text.asp?2011\/28\/4\/292\/83550","DOI":"10.4103\/0256-4602.83550"},{"key":"18_CR15","doi-asserted-by":"crossref","unstructured":"Lee-Thorp, J., Ainslie, J., Eckstein, I., Ontanon, S.: FNet: mixing tokens with Fourier transforms, May 2022. http:\/\/arxiv.org\/abs\/2105.03824","DOI":"10.18653\/v1\/2022.naacl-main.319"},{"key":"18_CR16","doi-asserted-by":"crossref","unstructured":"Lin, J., van Wijngaarden, A.J., Wang, K.C., Smith, M.C.: Speech enhancement using multi-stage self-attentive temporal convolutional networks, February 2021. http:\/\/arxiv.org\/abs\/2102.12078","DOI":"10.1109\/TASLP.2021.3125143"},{"key":"18_CR17","unstructured":"Liu, H., Dai, Z., So, D.R., Le, Q.V.: Pay attention to MLPs (2021). https:\/\/arxiv.org\/abs\/2105.08050"},{"key":"18_CR18","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization, January 2019. http:\/\/arxiv.org\/abs\/1711.05101"},{"key":"18_CR19","doi-asserted-by":"crossref","unstructured":"Lu, Y.X., Ai, Y., Ling, Z.H.: MP-SENet: a speech enhancement model with parallel denoising of magnitude and phase spectra. arXiv:2305.13686, arXiv, May 2023. http:\/\/arxiv.org\/abs\/2305.13686","DOI":"10.21437\/Interspeech.2023-1441"},{"key":"18_CR20","doi-asserted-by":"crossref","unstructured":"Lu, Y.X., Ai, Y., Ling, Z.H.: MP-SENet: a speech enhancement model with parallel denoising of magnitude and phase spectra, May 2023. http:\/\/arxiv.org\/abs\/2305.13686","DOI":"10.21437\/Interspeech.2023-1441"},{"key":"18_CR21","unstructured":"Martins, P.H., Marinho, Z., Martins, A.F.T.: $$\\infty $$-former: infinite memory transformer (2022). https:\/\/arxiv.org\/abs\/2109.00301"},{"key":"18_CR22","unstructured":"Miyazaki, K., Komatsu, T., Hayashi, T., Watanabe, S., Toda, T., Takeda, K.: Convolution-augmented transformer for semi-supervised sound event detection technical report (2020). https:\/\/api.semanticscholar.org\/CorpusID:221082840"},{"key":"18_CR23","doi-asserted-by":"crossref","unstructured":"Pascual, S., Bonafonte, A., Serr\u00e0, J.: SEGAN: speech enhancement generative adversarial network, June 2017. http:\/\/arxiv.org\/abs\/1703.09452","DOI":"10.21437\/Interspeech.2017-1428"},{"key":"18_CR24","doi-asserted-by":"publisher","unstructured":"Rix, A., Beerends, J., Hollier, M., Hekstra, A.: Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs. In: 2001 IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings (Cat. No.01CH37221), vol.\u00a02, pp. 749\u2013752. IEEE, Salt Lake City, UT, USA (2001). https:\/\/doi.org\/10.1109\/ICASSP.2001.941023. http:\/\/ieeexplore.ieee.org\/document\/941023\/","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"18_CR25","doi-asserted-by":"crossref","unstructured":"Smith, L.N., Topin, N.: Super-convergence: very fast training of neural networks using large learning rates, May 2018. http:\/\/arxiv.org\/abs\/1708.07120","DOI":"10.1117\/12.2520589"},{"key":"18_CR26","unstructured":"Stoller, D., Ewert, S., Dixon, S.: Wave-U-Net: a multi-scale neural network for end-to-end audio source separation, June 2018. http:\/\/arxiv.org\/abs\/1806.03185"},{"key":"18_CR27","doi-asserted-by":"publisher","unstructured":"Taal, C.H., Hendriks, R.C., Heusdens, R., Jensen, J.: An algorithm for intelligibility prediction of time\u2013frequency weighted noisy speech. IEEE Trans. Audio Speech Lang. Process. 19(7), 2125\u20132136 (2011). https:\/\/doi.org\/10.1109\/TASL.2011.2114881. http:\/\/ieeexplore.ieee.org\/document\/5713237\/","DOI":"10.1109\/TASL.2011.2114881"},{"key":"18_CR28","doi-asserted-by":"crossref","unstructured":"Tay, Y., Dehghani, M., Bahri, D., Metzler, D.: Efficient transformers: a survey (2022)","DOI":"10.1145\/3530811"},{"key":"18_CR29","doi-asserted-by":"publisher","unstructured":"Thiemann, J., Ito, N., Vincent, E.: The diverse environments multi-channel acoustic noise database (DEMAND): a database of multichannel environmental noise recordings. Proc. Meet. Acoust. 19(1), 035081 (2013). https:\/\/doi.org\/10.1121\/1.4799597","DOI":"10.1121\/1.4799597"},{"key":"18_CR30","unstructured":"Tolstikhin, I., et al.: MLP-mixer: an all-MLP architecture for vision (2021). https:\/\/arxiv.org\/abs\/2105.01601"},{"key":"18_CR31","doi-asserted-by":"publisher","unstructured":"Valentini-Botinhao, C., Wang, X., Takaki, S., Yamagishi, J.: Investigating RNN-based speech enhancement methods for noise-robust text-to-speech. In: Proceedings. 9th ISCA Workshop on Speech Synthesis Workshop (SSW 9), pp. 146\u2013152 (2016). https:\/\/doi.org\/10.21437\/SSW.2016-24","DOI":"10.21437\/SSW.2016-24"},{"key":"18_CR32","unstructured":"Vaswani, A., et al.: Attention is all You need, August 2023. http:\/\/arxiv.org\/abs\/1706.03762"},{"key":"18_CR33","doi-asserted-by":"publisher","unstructured":"Veaux, C., Yamagishi, J., King, S.: The voice bank corpus: design, collection and data analysis of a large regional accent speech database. In: 2013 International Conference Oriental COCOSDA Held Jointly with 2013 Conference on Asian Spoken Language Research and Evaluation (O-COCOSDA\/CASLRE), pp.\u00a01\u20134. IEEE, Gurgaon, India, November 2013. https:\/\/doi.org\/10.1109\/ICSDA.2013.6709856. http:\/\/ieeexplore.ieee.org\/document\/6709856\/","DOI":"10.1109\/ICSDA.2013.6709856"},{"key":"18_CR34","unstructured":"Wang, K., He, B., Zhu, W.P.: TSTNN: two-stage transformer based neural network for speech enhancement in the time domain, March 2021. http:\/\/arxiv.org\/abs\/2103.09963"},{"key":"18_CR35","unstructured":"Wang, S., Li, B.Z., Khabsa, M., Fang, H., Ma, H.: Linformer: self-attention with linear complexity, June 2020. http:\/\/arxiv.org\/abs\/2006.04768"},{"key":"18_CR36","unstructured":"Yin, D., Luo, C., Xiong, Z., Zeng, W.: PHASEN: a phase-and-harmonics-aware speech enhancement network, November 2019. http:\/\/arxiv.org\/abs\/1911.04697"},{"key":"18_CR37","doi-asserted-by":"crossref","unstructured":"Yu, G., Li, A., Zheng, C., Guo, Y., Wang, Y., Wang, H.: Dual-branch attention-in-attention transformer for single-channel speech enhancement, February 2022. http:\/\/arxiv.org\/abs\/2110.06467","DOI":"10.1109\/ICASSP43922.2022.9746273"},{"key":"18_CR38","doi-asserted-by":"crossref","unstructured":"Yu, W., et al.: MetaFormer is actually what you need for vision, July 2022. http:\/\/arxiv.org\/abs\/2111.11418","DOI":"10.1109\/CVPR52688.2022.01055"},{"key":"18_CR39","doi-asserted-by":"publisher","unstructured":"Yu, W., et al.: MetaFormer baselines for vision. IEEE Trans. Pattern Anal. Mach. Intell. 46(2), 896\u2013912 (2024). https:\/\/doi.org\/10.1109\/TPAMI.2023.3329173. http:\/\/arxiv.org\/abs\/2210.13452","DOI":"10.1109\/TPAMI.2023.3329173"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78014-1_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T13:06:25Z","timestamp":1732194385000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78014-1_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,22]]},"ISBN":["9783031780134","9783031780141"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78014-1_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,22]]},"assertion":[{"value":"22 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Belgrade","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Serbia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom2024.ftn.uns.ac.rs\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}