{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,10]],"date-time":"2026-05-10T15:28:19Z","timestamp":1778426899802,"version":"3.51.4"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031781032","type":"print"},{"value":"9783031781049","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78104-9_23","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:43:50Z","timestamp":1733089430000},"page":"335-350","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["ECMISM: Speech Recognition via\u00a0Enhancing Conformer Models with\u00a0Innovative Scoring Matrices"],"prefix":"10.1007","author":[{"given":"Jiang","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Liejun","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yinfeng","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Miaomiao","family":"Xu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"Rabiner, L.R.: A tutorial on hidden Markov models and selected applications in speech recognition. In Proceedings of the IEEE 77(2), 257\u2013286 (1989)","DOI":"10.1109\/5.18626"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Juang, B.H., Rabiner, L.R.: Hidden Markov models for speech recognition. Technometrics 33(3), 251\u2013272 (1991)","DOI":"10.1080\/00401706.1991.10484833"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"X Cui, Y Gong. A study of variable-parameter Gaussian mixture hidden Markov modeling for noisy speech recognition. In IEEE transactions on audio, speech, and language processing(TASLP), 2007, vol. 15, no. 4, pp. 1366-1376","DOI":"10.1109\/TASL.2006.889791"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"A Graves, S Fern\u00e1ndez, F Gomez, et al. Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In Proceedings of the 23rd international conference on Machine learning (ICML\u201906). Association for Computing Machinery, New York, NY, USA, 369-376, 2006","DOI":"10.1145\/1143844.1143891"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"J. Lee and S. Watanabe. Intermediate Loss Regularization for CTC-Based Speech Recognition. In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Toronto, ON, Canada, 2021, pp. 6224-6228","DOI":"10.1109\/ICASSP39728.2021.9414594"},{"key":"23_CR6","unstructured":"H Liu, Z Zhu, X Li, et al. Gram-CTC: Automatic unit selection and target decomposition for sequence labelling. In Proceedings of the 34th International Conference on Machine Learning, PMLR 70:2188-2197, 2017"},{"key":"23_CR7","unstructured":"D Amodei, S Ananthanarayanan, R Anubhai, et al. Deep speech 2: End-to-end speech recognition in english and mandarin. In International conference on machine learning. PMLR, 2016: 173-182"},{"key":"23_CR8","doi-asserted-by":"crossref","unstructured":"J Jorge, A Gim\u00e9nez, J Iranzo-S\u00e1nchez, et al. LSTM-based one-pass decoder for low-latency streaming. In ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2020: 7814-7818","DOI":"10.1109\/ICASSP40776.2020.9054267"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Zhao, R., Xue, J., Li, J., Challenges, O.A.P., for RNN-Transducer. In, et al.: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU). Cartagena, Colombia 2021, 526\u2013533 (2021)","DOI":"10.1109\/ASRU51503.2021.9688101"},{"key":"23_CR10","unstructured":"A Vaswani, N Shazeer, N Parmar, et al. Attention is all you need. Advances in neural information processing systems, 2017, 30"},{"key":"23_CR11","doi-asserted-by":"crossref","unstructured":"L Dong, S Xu, B Xu. Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Calgary, AB, Canada, 2018, pp. 5884-5888","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"A Gulati, J Qin, CC Chiu, et al. Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100, 2020","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"Burchi, M., Vielzeuf, V., Efficient conformer: Progressive downsampling and grouped attention for automatic speech recognition. In,: IEEE Automatic Speech Recognition and Understanding Workshop (ASRU). Cartagena, Colombia 2021, 8\u201315 (2021)","DOI":"10.1109\/ASRU51503.2021.9687874"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"A Andrusenko, R Nasretdinov, A Romanenko. Uconv-conformer: High reduction of input sequence length for end-to-end speech recognition. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Rhodes Island, Greece, 2023, pp. 1-5","DOI":"10.1109\/ICASSP49357.2023.10095430"},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q., Listen, attend and spell: A neural network for large vocabulary conversational speech recognition. In, et al.: IEEE international conference on acoustics, speech and signal processing (ICASSP). Shanghai, China 2016, 4960\u20134964 (2016)","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"C Chen, P Zhang. Cta-rnn: Channel and temporal-wise attention rnn leveraging pre-trained asr embeddings for speech emotion recognition. arXiv preprint arXiv:2203.17023, 2022","DOI":"10.21437\/Interspeech.2022-10403"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhang, F., Liu, C., Benchmarking lf-mmi, ctc and rnn-t criteria for streaming asr. In, et al.: IEEE spoken language technology workshop (SLT). Shenzhen, China 2021, 46\u201351 (2021)","DOI":"10.1109\/SLT48900.2021.9383623"},{"key":"23_CR18","doi-asserted-by":"crossref","unstructured":"B Zhang, D Wu, Z Peng, et al. Wenet 2.0: More productive end-to-end speech recognition toolkit. arXiv preprint arXiv:2203.15455, 2022","DOI":"10.21437\/Interspeech.2022-483"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Bu, H., Du, J., Na, X., Aishell-1: An open-source mandarin speech corpus and a speech recognition baseline. In, et al.: 20th conference of the oriental chapter of the international coordinating committee on speech databases and speech I\/O systems and assessment (O-COCOSDA). Seoul. Korea (South) 2017, 1\u20135 (2017)","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"23_CR20","unstructured":"Mozilla common voice, https:\/\/commonvoice.mozilla.org\/zh-CN\/datasets"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Z Gao, Z Li, J Wang, et al. Funasr: A fundamental end-to-end speech recognition toolkit. arXiv preprint arXiv:2305.11013, 2023","DOI":"10.21437\/Interspeech.2023-1428"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"K An, X Shi, S Zhang. BAT: Boundary aware transducer for memory-efficient and low-latency ASR. arXiv preprint arXiv:2305.11571, 2023","DOI":"10.21437\/Interspeech.2023-770"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"J Lee, L Lee, S Watanabe. Memory-efficient training of RNN-Transducer with sampled softmax. arXiv preprint arXiv:2203.16868, 2022","DOI":"10.21437\/Interspeech.2022-787"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Y Chen, W Ding, J Lai. Improving Noisy Student Training on Non-Target Domain Data for Automatic Speech Recognition. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Rhodes Island, Greece, 2023, pp. 1-5","DOI":"10.1109\/ICASSP49357.2023.10095704"},{"key":"23_CR25","unstructured":"A Radford, J W Kim, T Xu, et al. Robust speech recognition via large-scale weak supervision. In International Conference on Machine Learning. PMLR, 2023: 28492-28518"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, J., Xu, L., et al.: Collaborative Encoding Method for Scene Text Recognition in Low Linguistic Resources: The Uyghur Language Case Study. Appl. Sci. 14, 1707 (2024)","DOI":"10.3390\/app14051707"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Zhang, J., Wang, L., Yu, Y., et al.: Nonlinear Regularization Decoding Method for Speech Recognition. Sensors 24, 3846 (2024)","DOI":"10.3390\/s24123846"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"G Huang, L Zhuang, L Van Der Maaten, et al. Densely connected convolutional networks. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4700-4708. 2017","DOI":"10.1109\/CVPR.2017.243"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78104-9_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T23:29:56Z","timestamp":1733095796000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78104-9_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"ISBN":["9783031781032","9783031781049"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78104-9_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}