{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:41:28Z","timestamp":1755823288085,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Post-graduate Research & Practice Innovation Program of Jiangsu Province","award":["KYCX22 3668"],"award-info":[{"award-number":["KYCX22 3668"]}]},{"name":"Key Project of National Nature Science Foundation of China","award":["U1836220"],"award-info":[{"award-number":["U1836220"]}]},{"name":"National Nature Science Foundation of China","award":["62176106"],"award-info":[{"award-number":["62176106"]}]},{"name":"Jiangsu key research and development plan","award":["BE2020036"],"award-info":[{"award-number":["BE2020036"]}]},{"name":"MTRAC Grant for Advanced Computing Technologies"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612173","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"601-610","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["TE-KWS: Text-Informed Speech Enhancement for Noise-Robust Keyword Spotting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2286-5756","authenticated-orcid":false,"given":"Dong","family":"Liu","sequence":"first","affiliation":[{"name":"Jiangsu University &amp; Shandong Youth University of Political Science, Zhenjiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0616-4431","authenticated-orcid":false,"given":"Qirong","family":"Mao","sequence":"additional","affiliation":[{"name":"Jiangsu University, Zhenjiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6458-0660","authenticated-orcid":false,"given":"Lijian","family":"Gao","sequence":"additional","affiliation":[{"name":"Jiangsu University, Zhenjiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3441-4514","authenticated-orcid":false,"given":"Qinghua","family":"Ren","sequence":"additional","affiliation":[{"name":"Jiangsu University, Zhenjiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1841-539X","authenticated-orcid":false,"given":"Zhenghan","family":"Chen","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8133-7809","authenticated-orcid":false,"given":"Ming","family":"Dong","sequence":"additional","affiliation":[{"name":"Wayne State University, Detroit, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"CMGAN: Conformer-Based Metric-GAN for Monaural Speech Enhancement. arXiv preprint arXiv:2209.11112","author":"Abdulatif Sherif","year":"2022","unstructured":"Sherif Abdulatif, Ruizhe Cao, and Bin Yang. 2022. CMGAN: Conformer-Based Metric-GAN for Monaural Speech Enhancement. arXiv preprint arXiv:2209.11112 (2022)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2017.8081267"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58598-3_38"},{"key":"e_1_3_2_1_4_1","volume-title":"Hainan Xu, and Shinji Watanabe.","author":"Chen Szu-Jui","year":"2018","unstructured":"Szu-Jui Chen, Aswin Shanmugam Subramanian, Hainan Xu, and Shinji Watanabe. 2018. Building state-of-the-art distant speech recognition using the CHiME-4 challenge with a setup of speech enhancement baseline. arXiv preprint arXiv:1803.10109 (2018)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952155"},{"key":"e_1_3_2_1_6_1","first-page":"3372","article-title":"Temporal Convolution for Real-Time Keyword Spotting on Mobile Devices","volume":"2019","author":"Choi Seungwoo","year":"2019","unstructured":"Seungwoo Choi, Seokjun Seo, Beomjun Shin, Hyeongmin Byun, Martin Kersner, Beomsu Kim, Dongyoung Kim, and Sungjoo Ha. 2019. Temporal Convolution for Real-Time Keyword Spotting on Mobile Devices. Proc. Interspeech 2019 (2019), 3372--3376.","journal-title":"Proc. Interspeech"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683474"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746171"},{"key":"e_1_3_2_1_9_1","volume-title":"Martin Loesener Da Silva Viana, and Christoph Bernkopf","author":"De Andrade Douglas Coimbra","year":"2018","unstructured":"Douglas Coimbra De Andrade, Sabato Leo, Martin Loesener Da Silva Viana, and Christoph Bernkopf. 2018. A neural attention model for speech command recognition. arXiv preprint arXiv:1808.08929 (2018)."},{"key":"e_1_3_2_1_10_1","volume-title":"Real time speech enhancement in the waveform domain. arXiv preprint arXiv:2006.12847","author":"Defossez Alexandre","year":"2020","unstructured":"Alexandre Defossez, Gabriel Synnaeve, and Yossi Adi. 2020. Real time speech enhancement in the waveform domain. arXiv preprint arXiv:2006.12847 (2020)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01004"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178061"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3056212"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746020"},{"key":"e_1_3_2_1_15_1","volume-title":"Domain-adversarial training of neural networks. The journal of machine learning research 17, 1","author":"Ganin Yaroslav","year":"2016","unstructured":"Yaroslav Ganin, Evgeniya Ustinova, Hana Ajakan, Pascal Germain, Hugo Larochelle, Fran\u00e7ois Laviolette, Mario Marchand, and Victor Lempitsky. 2016. Domain-adversarial training of neural networks. The journal of machine learning research 17, 1 (2016), 2096-2030."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 2001 IEEE Workshop on the Applications of Signal Processing to Audio and Acoustics (Cat. No. 01TH8575)","author":"Hu Guoning","year":"2001","unstructured":"Guoning Hu and DeLiang Wang. 2001. Speech segregation based on pitch tracking and amplitude modulation. In Proceedings of the 2001 IEEE Workshop on the Applications of Signal Processing to Audio and Acoustics (Cat. No. 01TH8575). IEEE, 79--82."},{"key":"e_1_3_2_1_19_1","volume-title":"DCCRN: Deep complex convolution recurrent network for phase-aware speech enhancement. arXiv preprint arXiv:2008.00264","author":"Hu Yanxin","year":"2020","unstructured":"Yanxin Hu, Yun Liu, Shubo Lv, Mengtao Xing, Shimin Zhang, Yihui Fu, Jian Wu, Bihong Zhang, and Lei Xie. 2020. DCCRN: Deep complex convolution recurrent network for phase-aware speech enhancement. arXiv preprint arXiv:2008.00264 (2020)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.911054"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054538"},{"key":"e_1_3_2_1_23_1","first-page":"2195","article-title":"Sub-Band Convolutional Neural Networks for Small-Footprint Spoken Term Classification","volume":"2019","author":"Kao Chieh-Chi","year":"2019","unstructured":"Chieh-Chi Kao, Ming Sun, Yixin Gao, Shiv Vitaladevuni, and Chao Wang. 2019. Sub-Band Convolutional Neural Networks for Small-Footprint Spoken Term Classification. Proc. Interspeech 2019 (2019), 2195--2199.","journal-title":"Proc. Interspeech"},{"key":"e_1_3_2_1_24_1","volume-title":"Broadcasted residual learning for efficient keyword spotting. arXiv preprint arXiv:2106.04140","author":"Kim Byeonggeun","year":"2021","unstructured":"Byeonggeun Kim, Simyung Chang, Jinkyu Lee, and Dooyong Sung. 2021. Broadcasted residual learning for efficient keyword spotting. arXiv preprint arXiv:2106.04140 (2021)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Eesung Kim and Hyeji Seo. 2021. SE-Conformer: Time-Domain Speech Enhancement Using Conformer.. In Interspeech. 2736--2740.","DOI":"10.21437\/Interspeech.2021-2207"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Keisuke Kinoshita Marc Delcroix Atsunori Ogawa and Tomohiro Nakatani. 2015. Text-informed speech enhancement with deep neural networks. In INTERSPEECH. ISCA 1760--1764.","DOI":"10.21437\/Interspeech.2015-409"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-409"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2726762"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683092"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Bo Li Tara N Sainath Ron J Weiss Kevin W Wilson and Michiel Bacchiani. 2016. Neural network adaptive beamforming for robust multichannel speech recognition. (2016).","DOI":"10.21437\/Interspeech.2016-173"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6801"},{"key":"e_1_3_2_1_33_1","volume-title":"Transfer Feature Learning with Joint Distribution Adaptation. In 2013 IEEE International Conference on Computer Vision. 2200--2207","author":"Long Mingsheng","year":"2013","unstructured":"Mingsheng Long, Jianmin Wang, Guiguang Ding, Jiaguang Sun, and Philip S. Yu. 2013. Transfer Feature Learning with Joint Distribution Adaptation. In 2013 IEEE International Conference on Computer Vision. 2200--2207. https:\/\/doi.org\/10.1109\/ ICCV.2013.274"},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. PMLR, 2208--2217","author":"Long Mingsheng","year":"2017","unstructured":"Mingsheng Long, Han Zhu, Jianmin Wang, and Michael I Jordan. 2017. Deep transfer learning with joint adaptation networks. In International conference on machine learning. PMLR, 2208--2217."},{"key":"e_1_3_2_1_35_1","volume-title":"Conv-tasnet: Surpassing ideal time--frequency magnitude masking for speech separation","author":"Luo Yi","year":"2019","unstructured":"Yi Luo and Nima Mesgarani. 2019. Conv-tasnet: Surpassing ideal time--frequency magnitude masking for speech separation. IEEE\/ACM transactions on audio, speech, and language processing 27, 8 (2019), 1256--1266."},{"key":"e_1_3_2_1_36_1","first-page":"3356","article-title":"MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition","volume":"2020","author":"Majumdar Somshubra","year":"2020","unstructured":"Somshubra Majumdar and Boris Ginsburg. 2020. MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition. Proc. Interspeech 2020 (2020), 3356--3360.","journal-title":"Proc. Interspeech"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268946"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2764276"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_40_1","volume-title":"SEGAN: Speech enhancement generative adversarial network. arXiv preprint arXiv:1703.09452","author":"Pascual Santiago","year":"2017","unstructured":"Santiago Pascual, Antonio Bonafonte, and Joan Serra. 2017. SEGAN: Speech enhancement generative adversarial network. arXiv preprint arXiv:1703.09452 (2017)."},{"key":"e_1_3_2_1_41_1","unstructured":"Yi Ren Yangjun Ruan Xu Tan Tao Qin Sheng Zhao Zhou Zhao and Tie-Yan Liu. 2019. FastSpeech: Fast Robust and Controllable Text to Speech. In NeurIPS. 3165--3174."},{"key":"e_1_3_2_1_42_1","volume-title":"Streaming keyword spotting on mobile devices. arXiv preprint arXiv:2005.06720","author":"Rybakov Oleg","year":"2020","unstructured":"Oleg Rybakov, Natasha Kononenko, Niranjan Subrahmanya, Mirk\u00f3 Visontai, and Stella Laurenzo. 2020. Streaming keyword spotting on mobile devices. arXiv preprint arXiv:2005.06720 (2020)."},{"key":"e_1_3_2_1_43_1","first-page":"2277","article-title":"Streaming Keyword Spotting on Mobile Devices","volume":"2020","author":"Rybakov Oleg","year":"2020","unstructured":"Oleg Rybakov, Natasha Kononenko, Niranjan Subrahmanya, Mirk\u00f3 Visontai, and Stella Laurenzo. 2020. Streaming Keyword Spotting on Mobile Devices. Proc. Interspeech 2020 (2020), 2277--2281.","journal-title":"Proc. Interspeech"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Tara Sainath and Carolina Parada. 2015. Convolutional neural networks for small-footprint keyword spotting. (2015).","DOI":"10.21437\/Interspeech.2015-352"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2672401"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462211"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2019.8937266"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1777"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2006.09.003"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-480"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the Twenty-Ninth International Conference on International Joint Conferences on Artificial Intelligence. 3816--3822","author":"Tang Chuanxin","year":"2021","unstructured":"Chuanxin Tang, Chong Luo, Zhiyuan Zhao, Wenxuan Xie, and Wenjun Zeng. 2021. Joint time-frequency and time domain learning for speech enhancement. In Proceedings of the Twenty-Ninth International Conference on International Joint Conferences on Artificial Intelligence. 3816--3822."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Naohiro Tawara Tetsunori Kobayashi and Tetsuji Ogawa. 2019. Multi-Channel Speech Enhancement Using Time-Domain Convolutional Denoising Autoen-coder.. In INTERSPEECH. 86--90.","DOI":"10.21437\/Interspeech.2019-3197"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.4806631"},{"key":"e_1_3_2_1_55_1","volume-title":"Deep domain confusion: Maximizing for domain invariance. arXiv preprint arXiv:1412.3474","author":"Tzeng Eric","year":"2014","unstructured":"Eric Tzeng, Judy Hoffman, Ning Zhang, Kate Saenko, and Trevor Darrell. 2014. Deep domain confusion: Maximizing for domain invariance. arXiv preprint arXiv:1412.3474 (2014)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Cassia Valentini-Botinhao Xin Wang Shinji Takaki and Junichi Yamagishi. 2016. Investigating RNN-based speech enhancement methods for noise-robust Text-to-Speech.. In SSW. 146--152.","DOI":"10.21437\/SSW.2016-24"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","unstructured":"Cassia Valentini-Botinhao Xin Wang Shinji Takaki and Junichi Yamagishi. 2016. Speech Enhancement for a Noise-Robust Text-to-Speech Synthesis System Using Deep Recurrent Neural Networks. In interspeech. ISCA 352--356. https: \/\/doi.org\/10.21437\/Interspeech.2016-159","DOI":"10.21437\/Interspeech.2016-159"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2842159"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3015027"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953242"},{"key":"e_1_3_2_1_62_1","volume-title":"On training targets for supervised speech separation","author":"Wang Yuxuan","year":"2014","unstructured":"Yuxuan Wang, Arun Narayanan, and DeLiang Wang. 2014. On training targets for supervised speech separation. IEEE\/ACM transactions on audio, speech, and language processing 22, 12 (2014), 1849--1858."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-22482-4_11"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2364452"},{"key":"e_1_3_2_1_65_1","volume-title":"Neural spatio-temporal beamformer for target speech separation. arXiv preprint arXiv:2005.03889","author":"Xu Yong","year":"2020","unstructured":"Yong Xu, Meng Yu, Shi-Xiong Zhang, Lianwu Chen, Chao Weng, Jianming Liu, and Dong Yu. 2020. Neural spatio-temporal beamformer for target speech separation. arXiv preprint arXiv:2005.03889 (2020)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.6137"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6489"},{"key":"e_1_3_2_1_68_1","volume-title":"TridentSE: Guiding Speech Enhancement with 32 Global Tokens. arXiv preprint arXiv:2210.12995","author":"Yin Dacheng","year":"2022","unstructured":"Dacheng Yin, Zhiyuan Zhao, Chuanxin Tang, Zhiwei Xiong, and Chong Luo. 2022. TridentSE: Guiding Speech Enhancement with 32 Global Tokens. arXiv preprint arXiv:2210.12995 (2022)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2017.8081275"},{"key":"e_1_3_2_1_70_1","unstructured":"Yang You Jing Li Sashank J. Reddi Jonathan Hseu Sanjiv Kumar Srinadh Bho-janapalli Xiaodan Song James Demmel Kurt Keutzer and Cho-Jui Hsieh. 2020. Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. In ICLR. OpenReview.net."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746273"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953224"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612173","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612173","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:06:36Z","timestamp":1755821196000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612173"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":72,"alternative-id":["10.1145\/3581783.3612173","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612173","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}