{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T17:57:20Z","timestamp":1764784640835,"version":"3.37.3"},"reference-count":37,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100013290","name":"National Key Research and Development Program of China Stem Cell and Translational Research","doi-asserted-by":"publisher","award":["2017YFB1002102"],"award-info":[{"award-number":["2017YFB1002102"]}],"id":[{"id":"10.13039\/501100013290","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Signal Process. Lett."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/lsp.2021.3071668","type":"journal-article","created":{"date-parts":[[2021,4,7]],"date-time":"2021-04-07T19:41:09Z","timestamp":1617824469000},"page":"788-792","source":"Crossref","is-referenced-by-count":30,"title":["Efficiently Fusing Pretrained Acoustic and Linguistic Encoders for Low-Resource Speech Recognition"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5851-1167","authenticated-orcid":false,"given":"Cheng","family":"Yi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6889-0316","authenticated-orcid":false,"given":"Shiyu","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref33","DOI":"10.1007\/11939993_73"},{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"ref31","first-page":"5753","article-title":"Generalized autoregressive pretraining for language understanding","author":"yang","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref30","article-title":"Albert: A lite bert for self-supervised learning of language representations","author":"lan","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"doi-asserted-by":"publisher","key":"ref37","DOI":"10.1007\/978-3-030-04221-9_19"},{"key":"ref36","first-page":"1764","article-title":"Towards end-to-end speech recognition with recurrent neural networks","author":"graves","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"year":"2020","author":"yi","article-title":"Applying Wav2vec2.0 to speech recognition in various low-resource languages","key":"ref35"},{"doi-asserted-by":"publisher","key":"ref34","DOI":"10.18653\/v1\/N19-4009"},{"year":"2020","author":"tran","article-title":"Cross-modal transfer learning for multilingual speech-to-text translation","key":"ref10"},{"year":"2020","author":"dong","article-title":"A comparison of label-synchronous and frame-synchronous end-to-end models for speech recognition","key":"ref11"},{"key":"ref12","first-page":"2447","article-title":"A better way to pretrain deep Boltzmann machines","author":"hinton","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.18653\/v1\/N18-1143"},{"year":"2019","author":"jiang","article-title":"Improving transformer-based speech recognition using unsupervised pre-training","key":"ref14"},{"key":"ref15","article-title":"Non-autoregressive neural machine translation","author":"gu","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"doi-asserted-by":"publisher","key":"ref16","DOI":"10.1109\/SLT.2018.8639038"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.21437\/Interspeech.2018-1392"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1109\/ICASSP.2019.8682490"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1162\/tacl_a_00343"},{"key":"ref28","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref4","first-page":"12449","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"baevski","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"doi-asserted-by":"publisher","key":"ref27","DOI":"10.1145\/1143844.1143891"},{"key":"ref3","first-page":"12449","article-title":"Vq-wav2vec: Self-supervised learning of discrete speech representations","author":"baevski","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1109\/ICASSP.2019.8683535"},{"key":"ref29","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume":"1","author":"devlin","year":"0","journal-title":"Proc NAACL-HLT"},{"year":"2020","author":"conneau","article-title":"Unsupervised cross-lingual representation learning for speech recognition","key":"ref5"},{"year":"2018","author":"zhou","article-title":"Multilingual end-to-end speech recognition with a single transformer on low-resource languages","key":"ref8"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1109\/ICASSP40776.2020.9054250"},{"key":"ref2","first-page":"7694","article-title":"Effectiveness of self-supervised pre-training for ASR","author":"baevski","year":"0","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref9","first-page":"7095","article-title":"The speechtransformer for large-scale Mandarin Chinese speech recognition","author":"li","year":"0","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.21437\/Interspeech.2017-111"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1109\/ICASSP.2019.8683602"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.21437\/Interspeech.2019-3167"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.21437\/Interspeech.2018-1746"},{"doi-asserted-by":"publisher","key":"ref24","DOI":"10.21437\/Interspeech.2020-2404"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.21437\/Interspeech.2020-1280"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.21437\/Interspeech.2019-1212"},{"year":"2019","author":"chen","article-title":"Listen and fill in the missing letters: Non-autoregressive transformer for speech recognition","key":"ref25"}],"container-title":["IEEE Signal Processing Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/97\/9325893\/09398531.pdf?arnumber=9398531","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T14:50:31Z","timestamp":1652194231000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9398531\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/lsp.2021.3071668","relation":{},"ISSN":["1070-9908","1558-2361"],"issn-type":[{"type":"print","value":"1070-9908"},{"type":"electronic","value":"1558-2361"}],"subject":[],"published":{"date-parts":[[2021]]}}}