{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T17:53:29Z","timestamp":1776275609876,"version":"3.50.1"},"reference-count":53,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,13]]},"DOI":"10.1109\/asru51503.2021.9688053","type":"proceedings-article","created":{"date-parts":[[2022,2,3]],"date-time":"2022-02-03T20:31:00Z","timestamp":1643920260000},"page":"1019-1025","source":"Crossref","is-referenced-by-count":12,"title":["Decoupling Recognition and Transcription in Mandarin ASR"],"prefix":"10.1109","author":[{"given":"Jiahong","family":"Yuan","sequence":"first","affiliation":[{"name":"Baidu Research USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingyu","family":"Cai","sequence":"additional","affiliation":[{"name":"Baidu Research USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongji","family":"Gao","sequence":"additional","affiliation":[{"name":"Johns Hopkins University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Renjie","family":"Zheng","sequence":"additional","affiliation":[{"name":"Baidu Research USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liang","family":"Huang","sequence":"additional","affiliation":[{"name":"Baidu Research USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kenneth","family":"Church","sequence":"additional","affiliation":[{"name":"Baidu Research USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","author":"povey","year":"0","journal-title":"2011 IEEE Workshop on Automatic Speech Recognition &amp; Understanding"},{"key":"ref38","article-title":"Aishell-1: An open-source mandarin speech corpus and a speech recognition baseline","author":"hui","year":"0","journal-title":"Proceedings of O-COCOSDA"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref32","article-title":"Conformer: Convolution-augmented transformer for speech recognition","author":"anmol","year":"0","journal-title":"Proceedings of Interspeech"},{"key":"ref31","first-page":"5998","article-title":"Attention is all you need","author":"ashish","year":"0","journal-title":"Advances in neural information processing systems"},{"key":"ref30","article-title":"Fastcorrect: Fast error correction with edit alignment for automatic speech recognition","author":"leng","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref37","article-title":"Automatic recognition of suprasegmentals in speech","author":"yuan","year":"0","journal-title":"ASRU"},{"key":"ref36","article-title":"Speech emotion recognition with multi-task learning","author":"cai","year":"0","journal-title":"Proceedings of Interspeech 2021"},{"key":"ref35","article-title":"Applying wav2vec2.0 to speech recognition in various low-resource languages","author":"yi","year":"2020","journal-title":"ArXiv"},{"key":"ref34","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"baevski","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref28","article-title":"Efficient conformer-based speech recognition with linear attention","author":"li","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref27","article-title":"Improved conformer-based end-to-end speech recognition using neural architecture search","author":"liu","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414103"},{"key":"ref2","article-title":"Listen attentively, and spell once: Whole sentence generation via a non-autoregressive architecture for low-latency speech recognition","author":"bail","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref1","article-title":"Multi-head monotonic chunkwise attention for online speech recognition","author":"liu","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-021-00215-6"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-911"},{"key":"ref21","article-title":"Citrinet: Closing the gap between non-autoregressive and autoregressive end-to-end models for automatic speech recognition","author":"majumdar","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref24","article-title":"Wnars: Wfst based non-autoregressive streaming end-to-end speech recognition","author":"wang","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref23","article-title":"Darts-conformer: Towards efficient gradient-based neural architecture search for end-to-end asr","author":"shi","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref26","article-title":"Non-autoregressive transformer-based end-to-end asr using bert","author":"yu","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688238"},{"key":"ref50","article-title":"Research on modeling units of transformer transducer for mandarin speech recognition","author":"fu","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref51","article-title":"Decoupling pronunciation and language for end-to-end code-switching automatic speech recognition","author":"zhang","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383506"},{"key":"ref52","first-page":"7059","article-title":"Independent language modeling architecture for end-to-end asr","author":"van pham","year":"0","journal-title":"ICASSP 2020 &#x2014; 2020 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP)"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414694"},{"key":"ref11","article-title":"Improving rnn transducer with normalized jointer network","author":"huang","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref40","article-title":"New methods in continuous mandarin speech recognition","author":"chen","year":"0","journal-title":"Eurospeech"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-280"},{"key":"ref13","article-title":"Gated recurrent fusion with joint training framework for robust end-to-end speech recognition","author":"fan","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref14","article-title":"Multi-quartznet: Multi-resolution convolution for speech recognition with multi-layer feature fusion","author":"luo","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref15","article-title":"Transformer-based online speech recognition with decoder-end adaptive computation steps","author":"li","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref16","article-title":"Unified streaming and non-streaming two-pass end-to-end model for speech recognition","author":"zhang","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1983"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414594"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-427"},{"key":"ref4","article-title":"A further study of unsupervised pre-training for transformer based speech recognition","author":"jiang","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2086"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2556"},{"key":"ref5","article-title":"Simplified self-attention for transformer-based end-to-end speech recognition","author":"luo","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref8","article-title":"Recent developments on espnet toolkit","author":"guo","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2677"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683859"},{"key":"ref9","article-title":"One in a hundred: Select the best predicted sequence from numerous candidates for streaming speech recognition","author":"tian","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2014.07.087"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1365"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-334"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-04221-9_19"},{"key":"ref42","article-title":"Large vocabulary mandarin speech recognition with different approaches in modeling tones","author":"chang","year":"0","journal-title":"InterSpeech"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2014263"},{"key":"ref44","first-page":"5621","article-title":"Bytes are all you need: End-to-end multilingual speech recognition and synthesis with bytes","author":"bo","year":"0","journal-title":"ICASSP 2019 &#x2014; 2019 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP)"},{"key":"ref43","author":"lei","year":"2006","journal-title":"Modeling Lexical Tones for Mandarin Large Vocabulary Continuous Speech Recognition"}],"event":{"name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Cartagena, Colombia","start":{"date-parts":[[2021,12,13]]},"end":{"date-parts":[[2021,12,17]]}},"container-title":["2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9687821\/9687855\/09688053.pdf?arnumber=9688053","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T20:41:29Z","timestamp":1652733689000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9688053\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,13]]},"references-count":53,"URL":"https:\/\/doi.org\/10.1109\/asru51503.2021.9688053","relation":{},"subject":[],"published":{"date-parts":[[2021,12,13]]}}}