{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:21:27Z","timestamp":1776889287010,"version":"3.51.2"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,11,7]]},"DOI":"10.1109\/iscslp63861.2024.10800077","type":"proceedings-article","created":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T19:11:17Z","timestamp":1734981077000},"page":"26-30","source":"Crossref","is-referenced-by-count":13,"title":["Unveiling the Potential of LLM-Based ASR on Chinese Open-Source Datasets"],"prefix":"10.1109","author":[{"given":"Xuelong","family":"Geng","sequence":"first","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Tianyi","family":"Xu","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Kun","family":"Wei","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Bingshen","family":"Mu","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Hongfei","family":"Xue","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"He","family":"Wang","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Yangze","family":"Li","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Pengcheng","family":"Guo","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Yuhang","family":"Dai","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Longhao","family":"Li","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Mingchen","family":"Shao","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]},{"given":"Lei","family":"Xie","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}]}],"member":"263","reference":[{"key":"ref1","article-title":"LLaMA: Open and Efficient Foundation Language Models","author":"Touvron","year":"2023","journal-title":"CoRR"},{"key":"ref2","article-title":"BERT: Pretraining of Deep Bidirectional Transformers for Language Under-standing","author":"Devlin","year":"2019","journal-title":"NAACL-HLT"},{"key":"ref3","article-title":"GPT-4 Technical Report","year":"2023","journal-title":"CoRR"},{"key":"ref4","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"CoRR"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10097086"},{"key":"ref7","article-title":"Blank Collapse: Com-pressing CTC emission for the faster decoding","author":"Jung","year":"2022","journal-title":"CoRR"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1209"},{"key":"ref9","article-title":"Private Language Model Adaptation for Speech Recognition","author":"Liu","year":"2021","journal-title":"CoRR"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2888814"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2225"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682490"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30570"},{"key":"ref14","article-title":"Hugging-GPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face","author":"Shen","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref15","article-title":"Leveraging Large Language Models for Exploiting ASR Uncertainty","author":"Dighe","year":"2023","journal-title":"CoRR"},{"key":"ref16","article-title":"Can Generative Large Language Models Perform ASR Error Correction?","author":"Ma","year":"2023","journal-title":"CoRR"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389705"},{"key":"ref18","article-title":"Listen, Think, and Understand","author":"Gong","year":"2023","journal-title":"CoRR"},{"key":"ref19","article-title":"SALMONN: Towards Generic Hearing Abilities for Large Language Models","author":"Tang","year":"2023","journal-title":"CoRR"},{"key":"ref20","article-title":"Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models","author":"Chu","year":"2023","journal-title":"CoRR"},{"key":"ref21","article-title":"An Embarrassingly Simple Approach for LLM with Strong ASR Capacity","author":"Ma","year":"2024","journal-title":"CoRR"},{"key":"ref22","article-title":"Robust Speech Recognition via Large-Scale Weak Supervision","author":"Radford","year":"2023","journal-title":"ICML"},{"key":"ref23","article-title":"BEATs: Audio Pre-Training with Acoustic Tokenizers","author":"Chen","year":"2023","journal-title":"ICML"},{"key":"ref24","article-title":"Lib-rispeech: An ASR corpus based on public domain audio books","author":"Panayotov","year":"2015","journal-title":"ICASSP"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref26","article-title":"BLIP-2: Bootstrapping Language-Image Pretraining with Frozen Image Encoders and Large Language Models","author":"Li","year":"2023","journal-title":"ICML"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746682"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"ref29","article-title":"AISHELL-2: Transforming Mandarin ASR Research Into Industrial Scale","volume-title":"CoRR","author":"Du","year":"2018"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1397"},{"key":"ref31","article-title":"SpeechColab Leader-board: An Open-Source Platform for Automatic Speech Recog-nition Evaluation","author":"Du","year":"2024","journal-title":"CoRR"},{"key":"ref32","article-title":"Attention is All you Need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref33","article-title":"LoRA: Low-Rank Adaptation of Large Language Models","author":"Hu","year":"2022","journal-title":"ICLR"},{"key":"ref34","article-title":"Decoupled Weight Decay Regular-ization","author":"Loshchilov","year":"2019","journal-title":"ICLR"},{"key":"ref35","article-title":"Why Gradient Clip-ping Accelerates Training: A Theoretical Justification for Adap-tivity","author":"Zhang","year":"2020","journal-title":"ICLR"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-9996"},{"key":"ref37","article-title":"U2++: Unified Two-pass Bidirectional End-to-end Model for Speech Recognition","author":"Wu","year":"2021","journal-title":"CoRR"},{"key":"ref38","article-title":"We Net 2.0: More Productive End-to-End Speech Recognition Toolkit","author":"Zhang","year":"2022","journal-title":"Interspeech"}],"event":{"name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","location":"Beijing, China","start":{"date-parts":[[2024,11,7]]},"end":{"date-parts":[[2024,11,10]]}},"container-title":["2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10799944\/10799969\/10800077.pdf?arnumber=10800077","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,15]],"date-time":"2025-01-15T19:28:38Z","timestamp":1736969318000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10800077\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,7]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/iscslp63861.2024.10800077","relation":{},"subject":[],"published":{"date-parts":[[2024,11,7]]}}}