{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:30:05Z","timestamp":1776889805702,"version":"3.51.2"},"reference-count":50,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,6,6]]},"DOI":"10.1109\/icassp39728.2021.9414198","type":"proceedings-article","created":{"date-parts":[[2021,5,13]],"date-time":"2021-05-13T19:53:45Z","timestamp":1620935625000},"page":"8363-8367","source":"Crossref","is-referenced-by-count":36,"title":["Improved Mask-CTC for Non-Autoregressive End-to-End ASR"],"prefix":"10.1109","author":[{"given":"Yosuke","family":"Higuchi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hirofumi","family":"Inaguma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tetsuji","family":"Ogawa","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tetsunori","family":"Kobayashi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","first-page":"3935","article-title":"Enhancing the TED-LIUM corpus with selected data for language modeling and more TED talks","author":"rousseau","year":"2014","journal-title":"Porc of LREC"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref33","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proc of NAACL-HLT"},{"key":"ref32","first-page":"206","article-title":"Exploring neural transducers for end-to-end speech recognition","author":"battenberg","year":"2017","journal-title":"Proc of ASRU"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2086"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1619"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref36","article-title":"FastSpeech: Fast, robust and controllable text to speech","author":"ren","year":"2019","journal-title":"Proc of NeurIPS"},{"key":"ref35","article-title":"Understanding and improving transformer from a multi-particle dynamic system point of view","author":"lu","year":"2020","journal-title":"Proc ICLR"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"ref28","article-title":"Imputer: Sequence modelling via imputation and dynamic programming","author":"chan","year":"2020","journal-title":"Proc of ICML"},{"key":"ref27","article-title":"Listen and fill in the missing letters: Non-autoregressive Transformer for speech recognition","author":"chen","year":"2019"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2404"},{"key":"ref2","article-title":"Sequence transduction with recurrent neural networks","author":"graves","year":"2012"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref20","first-page":"5976","article-title":"Insertion Transformer: Flexible sequence generation via insertion operations","author":"stern","year":"2019","journal-title":"Proc of ICML"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1633"},{"key":"ref21","first-page":"11181","article-title":"Levenshtein Transformer","author":"gu","year":"2019","journal-title":"Proc of NeurIPS"},{"key":"ref24","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2020.emnlp-main.83","article-title":"Non-autoregressive machine translation with latent alignments","author":"saharia","year":"2020"},{"key":"ref23","article-title":"Semi-autoregressive training improves mask-predict decoding","author":"ghazvininejad","year":"2020"},{"key":"ref26","first-page":"1764","article-title":"Towards end-to-end speech recognition with recurrent neural networks","author":"graves","year":"2014","journal-title":"Proceedings of ICML"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1437"},{"key":"ref50","article-title":"Improved speech-to-text translation with the Fisher and Callhome Spanish&#x2013;English speech translation corpus","author":"post","year":"2013","journal-title":"Proc of IWSLT"},{"key":"ref10","first-page":"5884","article-title":"Speech-Transformer: a no-recurrence sequence-to-sequence model for speech recognition","author":"dong","year":"2018","journal-title":"Proc of ICASSP"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1341"},{"key":"ref40","article-title":"Voxforge","year":"0"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1938"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053889"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2059"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref16","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc of NeurIPS"},{"key":"ref17","article-title":"Non-autoregressive neural machine translation","author":"gu","year":"2018","journal-title":"Proc of ICLR"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1336"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1149"},{"key":"ref4","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"Proc of ICLR"},{"key":"ref3","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"Proc of NeurIPS"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462105"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205597"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1780"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1139"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054188"},{"key":"ref46","article-title":"Optimal completion distillation for sequence learning","author":"sabour","year":"2019","journal-title":"Proc of ICLR"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1048"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-demos.34"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1750"},{"key":"ref42","article-title":"Audio augmentation for speech recognition","author":"ko","year":"2015","journal-title":"Proc of Interspeech"},{"key":"ref41","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"Proc of ASRU"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"}],"event":{"name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Toronto, ON, Canada","start":{"date-parts":[[2021,6,6]]},"end":{"date-parts":[[2021,6,11]]}},"container-title":["ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9413349\/9413350\/09414198.pdf?arnumber=9414198","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,27]],"date-time":"2022-12-27T08:29:33Z","timestamp":1672129773000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9414198\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,6]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/icassp39728.2021.9414198","relation":{},"subject":[],"published":{"date-parts":[[2021,6,6]]}}}