{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T14:40:48Z","timestamp":1742913648374,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":36,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819794362"},{"type":"electronic","value":"9789819794379"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-9437-9_12","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T16:28:43Z","timestamp":1730392123000},"page":"145-157","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["AugMixSpeech: A Data Augmentation Method and\u00a0Consistency Regularization for\u00a0Mandarin Automatic Speech Recognition"],"prefix":"10.1007","author":[{"given":"Yang","family":"Jiang","sequence":"first","affiliation":[]},{"given":"Jun","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Kai","family":"Han","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Siqi","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Yuqing","family":"Song","sequence":"additional","affiliation":[]},{"given":"Zhe","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"12_CR1","doi-asserted-by":"crossref","unstructured":"Kheddar, H., Hemis, M., Himeur, Y.: Automatic speech recognition using advanced deep learning approaches: a survey. Inf. Fusion 102422 (2024)","DOI":"10.1016\/j.inffus.2024.102422"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Prabhavalkar, R., Hori, T., Sainath, T.N., Schl\u00fcter, R., Watanabe, S.: End-to-end speech recognition: a survey. IEEE\/ACM Trans. Audio Speech Lang. Process. (2023)","DOI":"10.1109\/TASLP.2023.3328283"},{"key":"12_CR3","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)"},{"issue":"7","key":"12_CR4","doi-asserted-by":"publisher","first-page":"5674","DOI":"10.1007\/s10489-024-05381-6","volume":"54","author":"K Ding","year":"2024","unstructured":"Ding, K., Li, R., Xu, Y., Du, X., Deng, B.: Adaptive data augmentation for mandarin automatic speech recognition. Appl. Intell. 54(7), 5674\u20135687 (2024)","journal-title":"Appl. Intell."},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Ko, T., Peddinti, V., Povey, D., Khudanpur, S.: Audio augmentation for speech recognition. In: Interspeech, vol.\u00a02015, p.\u00a03586 (2015)","DOI":"10.21437\/Interspeech.2015-711"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Wang, Y., Getreuer, P., Hughes, T., Lyon, R.F., Saurous, R.A.: Trainable frontend for robust and far-field keyword spotting. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5670\u20135674 (2017)","DOI":"10.1109\/ICASSP.2017.7953242"},{"key":"12_CR7","doi-asserted-by":"crossref","unstructured":"Park, D.S., et al.: Specaugment: a simple data augmentation method for automatic speech recognition. arXiv preprint arXiv:1904.08779 (2019)","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"12_CR8","unstructured":"Wu, D., et al.: U2++: unified two-pass bidirectional end-to-end model for speech recognition. arXiv preprint arXiv:2106.05642 (2021)"},{"key":"12_CR9","unstructured":"Zhang, H., Cisse, M., Dauphin, Y.N., Lopez-Paz, D.: mixup: beyond empirical risk minimization. In: International Conference on Learning Representations (2018)"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Meng, L., Xu, J., Tan, X., Wang, J., Qin, T., Xu, B.: Mixspeech: data augmentation for low-resource automatic speech recognition. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7008\u20137012. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414483"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S.J., Chun, S., Choe, J., Yoo, Y.: Cutmix: regularization strategy to train strong classifiers with localizable features. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00612"},{"key":"12_CR12","unstructured":"Hendrycks, D., Mu, N., Cubuk, E.D., Zoph, B., Gilmer, J., Lakshminarayanan, B.: Augmix: a simple method to improve robustness and uncertainty under data shift. In: International Conference on Learning Representations, vol.\u00a01, p.\u00a05 (2020)"},{"key":"12_CR13","unstructured":"Kim, J., Choo, W., Jeong, H., Song, H.O.: Co-mixup: saliency guided joint mixup with supermodular diversity. arxiv:2102.03065 (2021)"},{"key":"12_CR14","doi-asserted-by":"crossref","unstructured":"Ng, D., et al.: Contrastive speech mixup for low-resource keyword spotting. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096976"},{"key":"12_CR15","doi-asserted-by":"crossref","unstructured":"Qiu, S.: Construction of English speech recognition model by fusing cnn and random deep factorization tdnn. ACM Trans. Asian Low-Res. Lang. Inf. Process. (2023)","DOI":"10.1145\/3597456"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Zhang, N., Wang, J., Wei, W., Qu, X., Cheng, N., Xiao, J.: Cacnet: cube attentional cnn for automatic speech recognition. In: 2021 International Joint Conference on Neural Networks (IJCNN), pp.\u00a01\u20137. IEEE (2021)","DOI":"10.1109\/IJCNN52387.2021.9533666"},{"key":"12_CR17","doi-asserted-by":"publisher","first-page":"30069","DOI":"10.1109\/ACCESS.2022.3159339","volume":"10","author":"J Oruh","year":"2022","unstructured":"Oruh, J., Viriri, S., Adegun, A.: Long short-term memory recurrent neural network for automatic speech recognition. IEEE Access 10, 30069\u201330079 (2022)","journal-title":"IEEE Access"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"Fang, Y., Li, X.: Unimodal aggregation for ctc-based speech recognition. In: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 10591\u201310595 (2024)","DOI":"10.1109\/ICASSP48485.2024.10448248"},{"key":"12_CR19","doi-asserted-by":"crossref","unstructured":"Lei, Z., et al.: Personalization of ctc-based end-to-end speech recognition using pronunciation-driven subword tokenization. In: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 10096\u201310100 (2024)","DOI":"10.1109\/ICASSP48485.2024.10447571"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Gong, X., Wang, W., Shao, H., Chen, X., Qian, Y.: Factorized aed: factorized attention-based encoder-decoder for text-only domain adaptive asr. In: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10095937"},{"key":"12_CR21","doi-asserted-by":"publisher","first-page":"1436","DOI":"10.1109\/TASLP.2023.3263789","volume":"31","author":"R Fan","year":"2023","unstructured":"Fan, R., Chu, W., Chang, P., Alwan, A.: A ctc alignment-based non-autoregressive transformer for end-to-end automatic speech recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 31, 1436\u20131448 (2023)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"12_CR22","doi-asserted-by":"publisher","first-page":"1050","DOI":"10.1109\/TASLP.2023.3245407","volume":"31","author":"B Lyu","year":"2023","unstructured":"Lyu, B., Fan, C., Ming, Y., Zhao, P., Hu, N.: En-hacn: enhancing hybrid architecture with fast attention and capsule network for end-to-end speech recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 31, 1050\u20131062 (2023)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Chang, F.J., Radfar, M., Mouchtaris, A., King, B., Kunzmann, S.: End-to-end multi-channel transformer for speech recognition. In: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5884\u20135888 (2021)","DOI":"10.1109\/ICASSP39728.2021.9414123"},{"key":"12_CR24","unstructured":"Anmol, G., et al.: Conformer: convolution-augmented transformer for speech recognition. In: Conference of the International Speech Communication Association, pp. 5036\u20135040 (2020)"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Burchi, M., Vielzeuf, V.: Efficient conformer: progressive downsampling and grouped attention for automatic speech recognition. In: 2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 8\u201315 (2021)","DOI":"10.1109\/ASRU51503.2021.9687874"},{"key":"12_CR26","unstructured":"Kim, S., et al.: Squeezeformer: an efficient transformer for automatic speech recognition. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) Advances in Neural Information Processing Systems, vol.\u00a035, pp. 9361\u20139373. Curran Associates, Inc. (2022)"},{"issue":"6","key":"12_CR27","doi-asserted-by":"publisher","first-page":"1263","DOI":"10.1109\/JSTSP.2022.3196562","volume":"16","author":"WH Kang","year":"2022","unstructured":"Kang, W.H., Alam, J., Fathan, A.: L-mix: a latent-level instance mixup regularization for robust self-supervised speaker representation learning. IEEE J. Sel. Topics Signal Process. 16(6), 1263\u20131272 (2022)","journal-title":"IEEE J. Sel. Topics Signal Process."},{"issue":"1","key":"12_CR28","first-page":"1","volume":"1","author":"DH Johnson","year":"2001","unstructured":"Johnson, D.H., Sinanovic, S., et al.: Symmetrizing the kullback-leibler distance. IEEE Trans. Inf. Theory 1(1), 1\u201310 (2001)","journal-title":"IEEE Trans. Inf. Theory"},{"key":"12_CR29","doi-asserted-by":"crossref","unstructured":"Bu, H., Du, J., Na, X., Wu, B., Zheng, H.: Aishell-1: an open-source mandarin speech corpus and a speech recognition baseline. In: 2017 20th Conference of the Oriental Chapter of the International Coordinating Committee on Speech Databases and Speech I\/O Systems and Assessment (O-COCOSDA), pp.\u00a01\u20135. IEEE (2017)","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"12_CR30","doi-asserted-by":"crossref","unstructured":"Gao, Z., Zhang, S., McLoughlin, I., Yan, Z.: Paraformer: fast and accurate parallel transformer for non-autoregressive end-to-end speech recognition. In: Conference of the International Speech Communication Association, pp. 2063\u20132067 (2022)","DOI":"10.21437\/Interspeech.2022-9996"},{"key":"12_CR31","doi-asserted-by":"crossref","unstructured":"Lai, Z.H., et al.: InterFormer: interactive local and global features fusion for automatic speech recognition. In: Proceedings of INTERSPEECH 2023, pp. 566\u2013570 (2023)","DOI":"10.21437\/Interspeech.2023-509"},{"key":"12_CR32","doi-asserted-by":"crossref","unstructured":"Liang, C., et al.: Fast-u2++: fast and accurate end-to-end speech recognition in joint ctc\/attention frames. In: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10096154"},{"key":"12_CR33","doi-asserted-by":"crossref","unstructured":"Wang, J., Liang, Z., Zhang, X., Cheng, N., Xiao, J.: Efficientasr: speech recognition network compression via attention redundancy and chunk-level ffn optimization. arXiv preprint arXiv:2404.19214 (2024)","DOI":"10.1109\/IJCNN60899.2024.10651310"},{"key":"12_CR34","doi-asserted-by":"publisher","first-page":"471","DOI":"10.1109\/LSP.2024.3358754","volume":"31","author":"J Li","year":"2024","unstructured":"Li, J., Duan, Z., Li, S., Yu, X., Yang, G.: Esaformer: enhanced self-attention for automatic speech recognition. IEEE Signal Process. Lett. 31, 471\u2013475 (2024)","journal-title":"IEEE Signal Process. Lett."},{"issue":"1\/2","key":"12_CR35","doi-asserted-by":"publisher","first-page":"59","DOI":"10.17743\/jaes.2022.0112","volume":"72","author":"G Gao","year":"2024","unstructured":"Gao, G., et al.: Information extraction and noisy feature pruning for mandarin speech recognition. J. Audio Eng. Soc. 72(1\/2), 59\u201370 (2024)","journal-title":"J. Audio Eng. Soc."},{"key":"12_CR36","doi-asserted-by":"publisher","first-page":"421","DOI":"10.1109\/LSP.2024.3352489","volume":"31","author":"F Wang","year":"2024","unstructured":"Wang, F., Xu, B., Xu, B.: Sscformer: push the limit of chunk-wise conformer for streaming asr using sequentially sampled chunks and chunked causal convolution. IEEE Signal Process. Lett. 31, 421\u2013425 (2024)","journal-title":"IEEE Signal Process. Lett."}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-9437-9_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T16:29:12Z","timestamp":1730392152000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-9437-9_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9789819794362","9789819794379"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-9437-9_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hangzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2024\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}