{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T10:55:38Z","timestamp":1730199338743,"version":"3.28.0"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,13]]},"DOI":"10.1109\/asru51503.2021.9688009","type":"proceedings-article","created":{"date-parts":[[2022,2,3]],"date-time":"2022-02-03T20:31:00Z","timestamp":1643920260000},"page":"76-82","source":"Crossref","is-referenced-by-count":14,"title":["Improving Hybrid CTC\/Attention End-to-End Speech Recognition with Pretrained Acoustic and Language Models"],"prefix":"10.1109","author":[{"given":"Keqi","family":"Deng","sequence":"first","affiliation":[{"name":"Tencent Cloud Xiaowei,Beijing,China"}]},{"given":"Songjun","family":"Cao","sequence":"additional","affiliation":[{"name":"Tencent Cloud Xiaowei,Beijing,China"}]},{"given":"Yike","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent Cloud Xiaowei,Beijing,China"}]},{"given":"Long","family":"Ma","sequence":"additional","affiliation":[{"name":"Tencent Cloud Xiaowei,Beijing,China"}]}],"member":"263","reference":[{"key":"ref39","article-title":"Accurate, large minibatch SGD: training imagenet in 1 hour","volume":"abs 1706 2677","author":"goyal","year":"2017","journal-title":"CoRR"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-3041"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053281"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414641"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414227"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414310"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4009"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref35","article-title":"AISHELL-2: transforming mandarin ASR research into industrial scale","volume":"abs 1808 10583","author":"du","year":"2018","journal-title":"CoRR"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1039"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1186"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1454"},{"journal-title":"Language Models are Unsupervised Multitask Learners","year":"2019","author":"radford","key":"ref12"},{"key":"ref13","first-page":"6543","article-title":"Pretraining transformer decoder for end-to-end asr model with unpaired text data","author":"gao","year":"0","journal-title":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP)"},{"key":"ref14","first-page":"6000","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1048"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"ref19","article-title":"vq-wav2vec: Self-supervised learning of discrete speech representations","author":"baevski","year":"2020","journal-title":"8th International Conference on Learning Representations ICLR 2020"},{"key":"ref28","article-title":"Non-autoregressive transformer-based end-to-end ASR using BERT","volume":"abs 2104 4805","author":"yu","year":"2021","journal-title":"CoRR"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682586"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3066274"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2763455"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2021.3071668"},{"key":"ref29","article-title":"Applying wav2vec2.0 to speech recognition in various low-resource languages","volume":"abs 2012 12121","author":"yi","year":"2020","journal-title":"CoRR"},{"key":"ref5","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"baevski","year":"0","journal-title":"NIPS 2020"},{"key":"ref8","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologies NAACL-HLT 2019 Minneapolis MN USA June 2&#x2013;7 2019 Volume 1 (Long and Short Papers)"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref9","first-page":"3465","article-title":"wav2vec: Unsupervised PreTraining for Speech Recognition","author":"schneider","year":"0","journal-title":"Interspeech 2019"},{"key":"ref1","first-page":"1764","article-title":"Towards end-to-end speech recognition with recurrent neural networks","author":"graves","year":"2014","journal-title":"Proceedings of the 31th International Conference on Machine Learning ICML 2014"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1031"},{"key":"ref22","article-title":"Distilbert, a distilled version of BERT: smaller, faster, cheaper and lighter","volume":"abs 1910 1108","author":"sanh","year":"2019","journal-title":"CoRR"},{"key":"ref21","article-title":"Semantics of the unwritten","volume":"abs 2004 2251","author":"bai","year":"2020","journal-title":"CoRR"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462682"},{"key":"ref41","article-title":"The kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"2011 IEEE Workshop on Automatic Speech Recognition &amp; Understanding"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414575"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1179"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1392"}],"event":{"name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2021,12,13]]},"location":"Cartagena, Colombia","end":{"date-parts":[[2021,12,17]]}},"container-title":["2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9687821\/9687855\/09688009.pdf?arnumber=9688009","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T20:41:23Z","timestamp":1652733683000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9688009\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,13]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/asru51503.2021.9688009","relation":{},"subject":[],"published":{"date-parts":[[2021,12,13]]}}}