{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,23]],"date-time":"2025-12-23T05:26:41Z","timestamp":1766467601752,"version":"3.41.0"},"reference-count":40,"publisher":"Association for Computing Machinery (ACM)","issue":"11","license":[{"start":{"date-parts":[[2021,10,25]],"date-time":"2021-10-25T00:00:00Z","timestamp":1635120000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nd\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":["Commun. ACM"],"published-print":{"date-parts":[[2021,11]]},"abstract":"<jats:p>Several companies are trying push automatic speech recognition and other technologies past their current limitations.<\/jats:p>","DOI":"10.1145\/3481625","type":"journal-article","created":{"date-parts":[[2021,10,25]],"date-time":"2021-10-25T15:13:37Z","timestamp":1635174817000},"page":"81-87","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["The practice of speech and language processing in China"],"prefix":"10.1145","volume":"64","author":[{"given":"Jia","family":"Jia","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"given":"Wei","family":"Chen","sequence":"additional","affiliation":[{"name":"Sogou Corporation, Beijing, China"}]},{"given":"Kai","family":"Yu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Xiaodong","family":"He","sequence":"additional","affiliation":[{"name":"JD AI Research, Beijing, China"}]},{"given":"Jun","family":"Du","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"given":"Heung-Yeung","family":"Shum","sequence":"additional","affiliation":[{"name":"XiaoBing.ai, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,25]]},"reference":[{"key":"e_1_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-656"},{"key":"e_1_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2226"},{"volume-title":"Proceedings of the IEEE 2020 Intern. Conf. Acoustics, Speech and Signal Processing, 6574--6578","author":"Chen Z.","key":"e_1_2_1_3_1","unstructured":"Chen, Z., Wang, S., Qian, Y., and Yu, K. Channel invariant speaker embedding learning with joint multi-task and adversarial training. In Proceedings of the IEEE 2020 Intern. Conf. Acoustics, Speech and Signal Processing, 6574--6578."},{"volume-title":"Proceedings of the 2017 IEEE Conf. Computer Vision and Pattern Recognition, 3444--3453","author":"Chung J.","key":"e_1_2_1_4_1","unstructured":"Chung, J., Senior, A., Vinyals, O., and Zisserman, A. Lip reading sentences in the wild. In Proceedings of the 2017 IEEE Conf. Computer Vision and Pattern Recognition, 3444--3453."},{"key":"e_1_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2558822"},{"key":"e_1_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2218"},{"key":"e_1_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2471"},{"key":"e_1_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2404"},{"key":"e_1_2_1_9_1","volume-title":"et al. DCCRN: Deep complex convolution recurrent network for phase-aware speech enhancement. (2020)","author":"Hu Y.","year":"2008","unstructured":"Hu, Y. et al. DCCRN: Deep complex convolution recurrent network for phase-aware speech enhancement. (2020); arXiv:2008.00264."},{"key":"e_1_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Kothapally V. Xia W. Ghorbani S. Hansen J. Xue W. and Huang J. SkipConvNet: Skip convolutional neural network for speech dereverberation using optimally smoothed spectral mapping. (2020) arXiv:2007.09131.","DOI":"10.21437\/Interspeech.2020-2048"},{"volume-title":"Proceedings of 2021 IEEE Intern. Conf. Acoustics, Speech and Signal Processing.","author":"Li J.","key":"e_1_2_1_11_1","unstructured":"Li, J. et al. Densely connected multi-stage model with channel wise sub-band feature for real-time speech enhancement. In Proceedings of 2021 IEEE Intern. Conf. Acoustics, Speech and Signal Processing."},{"volume-title":"Proceedings of 2020 Joint Workshop for the Blizzard Challenge and Voice Conversion Challenge, 49--53","author":"Meng F.","key":"e_1_2_1_12_1","unstructured":"Meng, F. et al. The Sogou system for Blizzard Challenge. In Proceedings of 2020 Joint Workshop for the Blizzard Challenge and Voice Conversion Challenge, 49--53."},{"key":"e_1_2_1_13_1","volume-title":"Sound event localization and detection task. 2020 DCASE Challenge","author":"Politis A.","year":"2020","unstructured":"Politis, A., Adavanne, S., and Virtanen, T. Sound event localization and detection task. 2020 DCASE Challenge; http:\/\/dcase.community\/challenge2020\/task-sound-event-localization-anddetection-results"},{"key":"e_1_2_1_14_1","volume-title":"FastSpeech: Fast, robust, and controllable text to speech. NIPS","author":"Ren Y.","year":"2019","unstructured":"Ren, Y., Ruan, Y., Tan, X., Qin, T., Zhao, S., Zhao, Z., and Liu, T. FastSpeech: Fast, robust, and controllable text to speech. NIPS (2019), 3165--3174."},{"key":"e_1_2_1_15_1","unstructured":"Ryant N. Church K. Cieri C. Du J. Ganapathy S. and Liberman M. The third DIHARD Speech Diarization Challenge; https:\/\/sat.nist.gov\/dihard3#tab_leaderboard"},{"key":"e_1_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1057"},{"key":"e_1_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1631\/FITEE.1700826"},{"key":"e_1_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2172"},{"volume-title":"Proceedings of 2021 IEEE Intern. Conf. Acoustics, Speech and Signal Processing.","author":"Song W.","key":"e_1_2_1_19_1","unstructured":"Song, W., Yuan, X., Zhang, Z., Zhang, C., Wu, Y., He, X., and Zhou, B. Dian: Duration informed auto-regressive network for voice cloning. In Proceedings of 2021 IEEE Intern. Conf. Acoustics, Speech and Signal Processing."},{"key":"e_1_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2019.2920764"},{"key":"e_1_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2825432"},{"key":"e_1_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3062"},{"key":"e_1_2_1_23_1","unstructured":"Vaswani A. et al. Attention is all you need. (2017); arXiv:1706.03762."},{"key":"e_1_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2928128"},{"volume-title":"Proceedings of 2018 IEEE Intern. Conf. Acoustics, Speech and Signal Processing, 5339--5343","author":"Wang S.","key":"e_1_2_1_25_1","unstructured":"Wang, S., Qian, Y., and Yu, K. Focal KL-divergence based dilated convolutional neural networks for co-channel speaker identification. In Proceedings of 2018 IEEE Intern. Conf. Acoustics, Speech and Signal Processing, 5339--5343."},{"key":"e_1_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3016498"},{"volume-title":"The 6th CHiME Speech Separation and Recognition Challenge (2020)","author":"Watanabe S.","key":"e_1_2_1_27_1","unstructured":"Watanabe, S., Mandel, M., Barker, J., and Vincent, E. The 6th CHiME Speech Separation and Recognition Challenge (2020); https:\/\/chimechallenge.github.io\/chime6\/results.html"},{"volume-title":"Proceedings of 2021 IEEE Intern. Conf. Acoustics, Speech and Signal Processing.","author":"Xu G.","key":"e_1_2_1_28_1","unstructured":"Xu, G., Song, W., Zhang, Z., Zhang, C., He, X., and Zhou, B. Improving prosody modelling with cross-utterance BERT embeddings for end-to-end speech synthesis. In Proceedings of 2021 IEEE Intern. Conf. Acoustics, Speech and Signal Processing."},{"key":"e_1_2_1_29_1","volume-title":"A regression approach to speech enhancement based on deep neural networks","author":"Xu Y.","year":"2014","unstructured":"Xu, Y., Du, J., Dai, L-R., and Lee, C-H. A regression approach to speech enhancement based on deep neural networks. IEEE\/ACM Trans. on Audio, Speech, and Language Processing 23, 1 (2014), 7--19."},{"volume-title":"Proceedings of Interspeech 2019 2060--2064","author":"Xue L.","key":"e_1_2_1_30_1","unstructured":"Xue, L., Song, W., Xu, G., Xie, L., and Wu, Z. Building a mixed-lingual neural TTS system with only monolingual data. In Proceedings of Interspeech 2019 2060--2064."},{"key":"e_1_2_1_31_1","volume-title":"Neural kalman filtering for speech enhancement. 2020","author":"Xue W.","year":"2007","unstructured":"Xue, W., Quan, G. Zhang, C., Ding, G., He, X., and Zhou, B. Neural kalman filtering for speech enhancement. 2020; arXiv:2007.13962."},{"key":"e_1_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2759"},{"volume-title":"Proceedings of the 2021 IEEE Spoken Language Technology Workshop, 492--498","author":"Yang G.","key":"e_1_2_1_33_1","unstructured":"Yang, G., Yang, S., Liu, K., Fang, P., Chen, W., and Xie, L. Multiband MelGAN: Faster waveform generation for high-quality text-to-speech. In Proceedings of the 2021 IEEE Spoken Language Technology Workshop, 492--498."},{"volume-title":"Proceedings of the 2020 IEEE Intern. Conf. Acoustics, Speech and Signal Processing, 6454--6458","author":"Yang Y.","key":"e_1_2_1_34_1","unstructured":"Yang, Y., Wang, S., Gong, X., Qian, Y., and Yu, K. Text adaptation for speaker verification with speaker-text factorized embeddings. In Proceedings of the 2020 IEEE Intern. Conf. Acoustics, Speech and Signal Processing, 6454--6458."},{"key":"e_1_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414005"},{"key":"e_1_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3015659"},{"volume-title":"Proceedings of 2020 CCF International Conf. Natural Language Processing and Chinese Computing. Springer, 359--371","author":"Zhao Z.","key":"e_1_2_1_37_1","unstructured":"Zhao, Z., Liu, Y., Chen, L., Liu, Q., Ma, R., and Yu, K. An investigation on different underlying quantization schemes for pre-trained language models. In Proceedings of 2020 CCF International Conf. Natural Language Processing and Chinese Computing. Springer, 359--371."},{"key":"e_1_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1162\/coli_a_00368"},{"key":"e_1_2_1_39_1","volume-title":"Improving generalization of transformer for speech recognition with parallel schedule sampling and relative positional embedding. (2019)","author":"Zhou P.","year":"1911","unstructured":"Zhou, P., Fan, R., Chen, W., and Jia, J. Improving generalization of transformer for speech recognition with parallel schedule sampling and relative positional embedding. (2019); arXiv:1911.00203."},{"volume-title":"Proceedings of the 2019 IEEE Intern. Conf. on Acoustics, Speech and Signal Processing, 6565--6569","author":"Zhou P.","key":"e_1_2_1_40_1","unstructured":"Zhou, P., Yang, W., Chen, W., Wang, Y., and Jia, J. Modality attention for end-to-end audio-visual speech recognition. In Proceedings of the 2019 IEEE Intern. Conf. on Acoustics, Speech and Signal Processing, 6565--6569."}],"container-title":["Communications of the ACM"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3481625","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3481625","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:12Z","timestamp":1750193292000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3481625"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,25]]},"references-count":40,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2021,11]]}},"alternative-id":["10.1145\/3481625"],"URL":"https:\/\/doi.org\/10.1145\/3481625","relation":{},"ISSN":["0001-0782","1557-7317"],"issn-type":[{"type":"print","value":"0001-0782"},{"type":"electronic","value":"1557-7317"}],"subject":[],"published":{"date-parts":[[2021,10,25]]},"assertion":[{"value":"2021-10-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}