{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:22:37Z","timestamp":1775229757703,"version":"3.50.1"},"reference-count":30,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,6,6]]},"DOI":"10.1109\/icassp39728.2021.9415058","type":"proceedings-article","created":{"date-parts":[[2021,5,13]],"date-time":"2021-05-13T19:53:45Z","timestamp":1620935625000},"page":"6209-6213","source":"Crossref","is-referenced-by-count":33,"title":["A General Multi-Task Learning Framework to Leverage Text Data for Speech to Text Tasks"],"prefix":"10.1109","author":[{"given":"Yun","family":"Tang","sequence":"first","affiliation":[{"name":"Facebook AI,USA"}]},{"given":"Juan","family":"Pino","sequence":"additional","affiliation":[{"name":"Facebook AI,USA"}]},{"given":"Changhan","family":"Wang","sequence":"additional","affiliation":[{"name":"Facebook AI,USA"}]},{"given":"Xutai","family":"Ma","sequence":"additional","affiliation":[{"name":"Facebook AI,USA"}]},{"given":"Dmitriy","family":"Genzel","sequence":"additional","affiliation":[{"name":"Facebook AI,USA"}]}],"member":"263","reference":[{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2938"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.217"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref13","article-title":"Towards better decoding and language model integration in sequence to sequence models","author":"chorowski","year":"2016","journal-title":"InterSpeech"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003990"},{"key":"ref15","article-title":"Cold fusion: Training seq2seq models together with language models","author":"sriram","year":"2017","journal-title":"InterSpeech"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1554"},{"key":"ref17","article-title":"Leveraging weakly supervised data to improve end-to-end speech-to-text translation","author":"jia","year":"2018","journal-title":"ICASSP"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003774"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053281"},{"key":"ref28","article-title":"Learning pronunciation from a foreign language in speech synthesis networks","author":"lee","year":"2018"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-503"},{"key":"ref27","article-title":"MuST-C: a multilingual speech translation corpus","author":"gangi","year":"2019","journal-title":"NAACL-HLT"},{"key":"ref3","article-title":"Listen and translate: A proof of concept for end-to-end speech-to-text translation","author":"berard","year":"2016","journal-title":"NIPS"},{"key":"ref6","article-title":"fairseq s2t: Fast speech-to-text modeling with fairseq","author":"wang","year":"2020","journal-title":"AACL"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9004003"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-demos.34"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1780"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref9","article-title":"The IWSLT 2019 evaluation campaign","author":"niehues","year":"2019"},{"key":"ref1","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"NIPS"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2456"},{"key":"ref22","article-title":"Unsupervised machine translation using monolingual corpora only","author":"lample","year":"2018","journal-title":"ICLRE"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1008"},{"key":"ref24","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NIPS"},{"key":"ref23","article-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","author":"lewis","year":"2020","journal-title":"ACL"},{"key":"ref26","article-title":"On layer normalization in the transformer architecture","author":"xiong","year":"2019"},{"key":"ref25","article-title":"End-to-end asr: from supervised to semi-supervised learning with modern architectures","author":"synnaeve","year":"2020"}],"event":{"name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Toronto, ON, Canada","start":{"date-parts":[[2021,6,6]]},"end":{"date-parts":[[2021,6,11]]}},"container-title":["ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9413349\/9413350\/09415058.pdf?arnumber=9415058","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,3]],"date-time":"2022-08-03T00:21:02Z","timestamp":1659486062000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9415058\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,6]]},"references-count":30,"URL":"https:\/\/doi.org\/10.1109\/icassp39728.2021.9415058","relation":{},"subject":[],"published":{"date-parts":[[2021,6,6]]}}}