{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:23:02Z","timestamp":1760314982170,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032079589","type":"print"},{"value":"9783032079596","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07959-6_13","type":"book-chapter","created":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:00Z","timestamp":1760260920000},"page":"174-188","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["CrossMP-SENet: Transformer-Based Cross-Attention for\u00a0Joint Magnitude-Phase Speech Enhancement"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-9633-7569","authenticated-orcid":false,"given":"Alexander","family":"Zaburdaev","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0412-7765","authenticated-orcid":false,"given":"Denis","family":"Ivanko","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7935-0569","authenticated-orcid":false,"given":"Dmitry","family":"Ryumin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,13]]},"reference":[{"key":"13_CR1","doi-asserted-by":"publisher","unstructured":"Abdulatif, S., Cao, R., Yang, B.: CMGAN: conformer-based metric-GAN for monaural speech enhancement. IEEE\/ACM Trans. Audio Speech Lang. Process. 2477\u20132493 (2024). https:\/\/doi.org\/10.1109\/TASLP.2024.3393718","DOI":"10.1109\/TASLP.2024.3393718"},{"issue":"10","key":"13_CR2","doi-asserted-by":"publisher","first-page":"29859","DOI":"10.1007\/s11042-023-16665-3","volume":"83","author":"M Anees","year":"2024","unstructured":"Anees, M.: Speech coding techniques and challenges: a comprehensive literature survey. Multimed. Tools Appl. 83(10), 29859\u201329879 (2024). https:\/\/doi.org\/10.1007\/s11042-023-16665-3","journal-title":"Multimed. Tools Appl."},{"key":"13_CR3","doi-asserted-by":"publisher","unstructured":"Axyonov, A., Ryumin, D., Ivanko, D., Kashevnik, A., Karpov, A.: Audio-visual speech recognition in-the-wild: multi-angle vehicle cabin corpus and attention-based method. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 8195\u20138199. IEEE (2024). https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10448048","DOI":"10.1109\/ICASSP48485.2024.10448048"},{"key":"13_CR4","doi-asserted-by":"publisher","unstructured":"Botinhao, C.V., Wang, X., Takaki, S., Yamagishi, J.: Investigating RNN-based speech enhancement methods for noise-robust text-to-speech. In: ISCA Speech Synthesis Workshop (SSW), pp. 159\u2013165 (2016). 
https:\/\/doi.org\/10.21437\/SSW.2016-24","DOI":"10.21437\/SSW.2016-24"},{"key":"13_CR5","doi-asserted-by":"publisher","unstructured":"Chao, R., et al.: An investigation of incorporating mamba for speech enhancement. In: IEEE Spoken Language Technology Workshop (SLTW), pp. 302\u2013308 (2024). https:\/\/doi.org\/10.1109\/SLT61566.2024.10832332","DOI":"10.1109\/SLT61566.2024.10832332"},{"issue":"4","key":"13_CR6","doi-asserted-by":"publisher","first-page":"883","DOI":"10.1007\/s10772-020-09674-2","volume":"24","author":"N Das","year":"2020","unstructured":"Das, N., Chakraborty, S., Chaki, J., Padhy, N., Dey, N.: Fundamentals, present and future perspectives of speech enhancement. Int. J. Speech Technol. 24(4), 883\u2013901 (2020). https:\/\/doi.org\/10.1007\/s10772-020-09674-2","journal-title":"Int. J. Speech Technol."},{"key":"13_CR7","doi-asserted-by":"publisher","unstructured":"He, J., Gao, Y., Zhang, T., Zhang, Z., Wu, F.: D$$^{2}$$Former: jointly learning hierarchical detectors and contextual descriptors via agent-based transformers. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2904\u20132914 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.00284","DOI":"10.1109\/CVPR52729.2023.00284"},{"key":"13_CR8","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1007\/978-3-030-87802-3_27","volume-title":"Speech and Computer","author":"D Ivanko","year":"2021","unstructured":"Ivanko, D., Ryumin, D., Axyonov, A., Kashevnik, A.: Speaker-dependent visual command recognition in vehicle cabin: methodology and evaluation. In: Karpov, A., Potapova, R. (eds.) SPECOM 2021. LNCS (LNAI), vol. 12997, pp. 291\u2013302. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-87802-3_27"},{"key":"13_CR9","unstructured":"Ivanko, D., et al.: DAVIS: driver\u2019s audio-visual speech recognition. In: INTERSPEECH, pp. 1141\u20131142 (2022)"},{"issue":"01","key":"13_CR10","doi-asserted-by":"publisher","first-page":"2550001","DOI":"10.1142\/S0219467825500019","volume":"25","author":"C Jannu","year":"2025","unstructured":"Jannu, C., Vanambathina, S.D.: An overview of speech enhancement based on deep learning techniques. Int. J. Image Graph. 25(01), 2550001 (2025). https:\/\/doi.org\/10.1142\/S0219467825500019","journal-title":"Int. J. Image Graph."},{"key":"13_CR11","doi-asserted-by":"crossref","unstructured":"Khan, M.S., La\u00a0Quatra, M., Hung, K.H., Fu, S.W., Siniscalchi, S.M., Tsao, Y.: Exploiting consistency-preserving loss and perceptual contrast stretching to boost ssl-based speech enhancement. In: International Workshop on Multimedia Signal Processing (MMSP), pp.\u00a01\u20136 (2024)","DOI":"10.1109\/MMSP61759.2024.10743615"},{"key":"13_CR12","doi-asserted-by":"publisher","unstructured":"Kim, E., Seo, H.: SE-conformer: time-domain speech enhancement using conformer. In: INTERSPEECH, pp. 2736\u20132740 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-2207","DOI":"10.21437\/Interspeech.2021-2207"},{"key":"13_CR13","unstructured":"K\u00fchne, N.L., \u00d8stergaard, J., Jensen, J., Tan, Z.H.: xLSTM-SENet: xLSTM for single-channel speech enhancement. In: INTERSPEECH (2025)"},{"key":"13_CR14","doi-asserted-by":"publisher","unstructured":"Lin, Z., Wang, J., Li, R., Shen, F., Xuan, X.: PrimeK-net: multi-scale spectral learning via group prime-kernel convolutional neural networks for single channel speech enhancement. 
In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2025). https:\/\/doi.org\/10.1109\/ICASSP49660.2025.10890034","DOI":"10.1109\/ICASSP49660.2025.10890034"},{"key":"13_CR15","doi-asserted-by":"publisher","unstructured":"Liu, J., Li, Z.: TF-transformer: temporal-frequency transformer for OFDM signal recognition. In: IEEE Wireless Communications and Networking Conference (WCNC), pp.\u00a01\u20136 (2025). https:\/\/doi.org\/10.1109\/WCNC61545.2025.10978396","DOI":"10.1109\/WCNC61545.2025.10978396"},{"key":"13_CR16","doi-asserted-by":"publisher","unstructured":"Lu, Y.X., Ai, Y., Ling, Z.H.: MP-SENet: a speech enhancement model with parallel denoising of magnitude and phase spectra. In: INTERSPEECH, pp. 3834\u20133838 (2023). https:\/\/doi.org\/10.21437\/Interspeech.2023-1441","DOI":"10.21437\/Interspeech.2023-1441"},{"key":"13_CR17","doi-asserted-by":"publisher","unstructured":"Lu, Y.X., Ai, Y., Ling, Z.H.: Explicit estimation of magnitude and phase spectra in parallel for high-quality speech enhancement. Neural Netw. 107562 (2025). https:\/\/doi.org\/10.1016\/j.neunet.2025.107562","DOI":"10.1016\/j.neunet.2025.107562"},{"key":"13_CR18","doi-asserted-by":"publisher","unstructured":"de\u00a0Oliveira, D., Welker, S., Richter, J., Gerkmann, T.: The PESQetarian: on the relevance of goodhart\u2019s law for speech enhancement. In: INTERSPEECH, pp. 3854\u20133858 (2024). https:\/\/doi.org\/10.21437\/Interspeech.2024-2051","DOI":"10.21437\/Interspeech.2024-2051"},{"issue":"1","key":"13_CR19","doi-asserted-by":"publisher","first-page":"110","DOI":"10.1109\/THMS.2023.3339663","volume":"54","author":"D O\u2019Shaughnessy","year":"2024","unstructured":"O\u2019Shaughnessy, D.: Speech enhancement - a review of modern methods. IEEE Trans. Hum.-Mach. Syst. 54(1), 110\u2013120 (2024). https:\/\/doi.org\/10.1109\/THMS.2023.3339663","journal-title":"IEEE Trans. Hum.-Mach. Syst."},{"key":"13_CR20","doi-asserted-by":"publisher","unstructured":"Pandey, A., Wang, D.: Densely connected neural network with dilated convolutions for real-time speech enhancement in the time domain. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6629\u20136633 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054536","DOI":"10.1109\/ICASSP40776.2020.9054536"},{"key":"13_CR21","doi-asserted-by":"publisher","unstructured":"Pirklbauer, J., et al.: Evaluation metrics for generative speech enhancement methods: issues and perspectives. In: Speech Communication; ITG Conference, pp. 265\u2013269. VDE (2023). https:\/\/doi.org\/10.30420\/456164052","DOI":"10.30420\/456164052"},{"key":"13_CR22","doi-asserted-by":"publisher","first-page":"209","DOI":"10.5194\/isprs-archives-XLVIII-2-W3-2023-209-2023","volume":"48","author":"D Ryumin","year":"2023","unstructured":"Ryumin, D., Ivanko, D., Axyonov, A.: Cross-language transfer learning using visual information for automatic sign gesture recognition. Int. Arch. Photogramm. Remote. Sens. Spat. Inf. Sci. 48, 209\u2013216 (2023). https:\/\/doi.org\/10.5194\/isprs-archives-XLVIII-2-W3-2023-209-2023","journal-title":"Int. Arch. Photogramm. Remote. Sens. Spat. Inf. 
Sci."},{"key":"13_CR23","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.124159","volume":"252","author":"D Ryumin","year":"2024","unstructured":"Ryumin, D., Axyonov, A., Ryumina, E., Ivanko, D., Kashevnik, A., Karpov, A.: Audio-visual speech recognition based on regulated transformer and spatio-temporal fusion strategy for driver assistive systems. Expert Syst. Appl. 252, 124159 (2024). https:\/\/doi.org\/10.1016\/j.eswa.2024.124159","journal-title":"Expert Syst. Appl."},{"issue":"4","key":"13_CR24","doi-asserted-by":"publisher","first-page":"2284","DOI":"10.3390\/s23042284","volume":"23","author":"D Ryumin","year":"2023","unstructured":"Ryumin, D., Ivanko, D., Ryumina, E.: Audio-visual speech and gesture recognition by sensors of mobile devices. Sensors 23(4), 2284 (2023). https:\/\/doi.org\/10.3390\/s23042284","journal-title":"Sensors"},{"key":"13_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.122441","volume":"239","author":"E Ryumina","year":"2024","unstructured":"Ryumina, E., Markitantov, M., Ryumin, D., Karpov, A.: Ocean-AI framework with emoformer cross-hemiface attention approach for personality traits assessment. Expert Syst. Appl. 239, 122441 (2024). https:\/\/doi.org\/10.1016\/j.eswa.2023.122441","journal-title":"Expert Syst. Appl."},{"key":"13_CR26","doi-asserted-by":"publisher","first-page":"192","DOI":"10.1016\/j.patrec.2025.02.024","volume":"190","author":"E Ryumina","year":"2025","unstructured":"Ryumina, E., Ryumin, D., Axyonov, A., Ivanko, D., Karpov, A.: Multi-corpus emotion recognition method based on cross-modal gated attention fusion. Pattern Recogn. Lett. 190, 192\u2013200 (2025). https:\/\/doi.org\/10.1016\/j.patrec.2025.02.024","journal-title":"Pattern Recogn. Lett."},{"key":"13_CR27","doi-asserted-by":"publisher","unstructured":"Schroter, H., Escalante-B, A.N., Rosenkranz, T., Maier, A.: DeepFilterNet: a low complexity speech enhancement framework for full-band audio based on deep filtering. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7407\u20137411 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747055","DOI":"10.1109\/ICASSP43922.2022.9747055"},{"key":"13_CR28","doi-asserted-by":"publisher","unstructured":"Thiemann, J., Ito, N., Vincent, E.: DEMAND: A Collection of Multi-channel Recordings of Acoustic Noise in Diverse Environments (2013). https:\/\/doi.org\/10.5281\/zenodo.1227121","DOI":"10.5281\/zenodo.1227121"},{"key":"13_CR29","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems (NeurIPS), pp. 6000\u20136010 (2017)"},{"key":"13_CR30","doi-asserted-by":"publisher","unstructured":"Veaux, C., Yamagishi, J., King, S.: The voice bank corpus: design, collection and data analysis of a large regional accent speech database. In: IEEE International Conference Oriental COCOSDA Held Jointly with Conference on Asian Spoken Language Research and Evaluation (O-COCOSDA\/CASLRE), pp.\u00a01\u20134 (2013). https:\/\/doi.org\/10.1109\/ICSDA.2013.6709856","DOI":"10.1109\/ICSDA.2013.6709856"},{"key":"13_CR31","doi-asserted-by":"publisher","unstructured":"Wang, H., Tian, B.: ZipEnhancer: dual-path down-up sampling-based zipformer for monaural speech enhancement. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2025). 
https:\/\/doi.org\/10.1109\/ICASSP49660.2025.10888703","DOI":"10.1109\/ICASSP49660.2025.10888703"},{"key":"13_CR32","doi-asserted-by":"publisher","unstructured":"Wang, J., Lin, Z., Wang, T., Ge, M., Wang, L., Dang, J.: Mamba-SEUNet: mamba UNet for monaural speech enhancement. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2025). https:\/\/doi.org\/10.1109\/ICASSP49660.2025.10889525","DOI":"10.1109\/ICASSP49660.2025.10889525"},{"key":"13_CR33","doi-asserted-by":"publisher","unstructured":"Wang, Z.Q., Cornell, S., Choi, S., Lee, Y., Kim, B.Y., Watanabe, S.: TF-GridNet: making time-frequency domain models great again for monaural speaker separation. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2023). https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10094992","DOI":"10.1109\/ICASSP49357.2023.10094992"},{"key":"13_CR34","doi-asserted-by":"publisher","unstructured":"Yang, L., Liu, W., Meng, R., Lee, G., Baek, S., Moon, H.G.: FSPEN: an ultra-lightweight network for real time speech enhancement. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 10671\u201310675 (2024). https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10446016","DOI":"10.1109\/ICASSP48485.2024.10446016"},{"key":"13_CR35","doi-asserted-by":"publisher","unstructured":"Yin, D., Luo, C., Xiong, Z., Zeng, W.: PHASEN: a phase-and-harmonics-aware speech enhancement network. In: AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 9458\u20139465 (2020). https:\/\/doi.org\/10.1609\/AAAI.V34I05.6489","DOI":"10.1609\/AAAI.V34I05.6489"},{"key":"13_CR36","doi-asserted-by":"publisher","unstructured":"Zadorozhnyy, V., Ye, Q., Koishida, K.: SCP-GAN: self-correcting discriminator optimization for training consistency preserving metric GAN on speech enhancement tasks. In: INTERSPEECH, pp. 2463\u20132467 (2023). 
https:\/\/doi.org\/10.21437\/Interspeech.2023-456","DOI":"10.21437\/Interspeech.2023-456"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07959-6_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:03Z","timestamp":1760260923000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07959-6_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,13]]},"ISBN":["9783032079589","9783032079596"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07959-6_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,13]]},"assertion":[{"value":"13 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Szeged","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hungary","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom.inf.u-szeged.hu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
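The payload above is a single work record from the Crossref REST API (message-type "work"). As a minimal sketch of how such a record can be retrieved and inspected, assuming network access to the public api.crossref.org endpoint; the DOI and all field names below are taken from the record itself, nothing else is implied about the chapter:

# Minimal sketch: fetch this Crossref work record and read a few fields.
# Assumes the public endpoint https://api.crossref.org/works/{DOI}; for
# heavy production use Crossref asks clients to identify themselves
# (e.g. a mailto in the User-Agent), omitted here for brevity.
import json
import urllib.request

DOI = "10.1007/978-3-032-07959-6_13"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    work = json.load(resp)["message"]  # payload sits under "message"

print(work["title"][0])
print(", ".join(f'{a["given"]} {a["family"]}' for a in work["author"]))
print("references:", work["references-count"])

The same envelope ("status", "message-type", "message-version", "message") wraps every /works response, so the record of interest is always the value of "message".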