{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:29:47Z","timestamp":1775230187435,"version":"3.50.1"},"reference-count":42,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,13]]},"DOI":"10.1109\/asru51503.2021.9688251","type":"proceedings-article","created":{"date-parts":[[2022,2,3]],"date-time":"2022-02-03T20:31:00Z","timestamp":1643920260000},"page":"16-23","source":"Crossref","is-referenced-by-count":18,"title":["A Study of Transducer Based End-to-End ASR with ESPnet: Architecture, Auxiliary Loss and Decoding Strategies"],"prefix":"10.1109","author":[{"given":"Florian","family":"Boyer","sequence":"first","affiliation":[{"name":"Airudit, Speech Lab."}]},{"given":"Yusuke","family":"Shinohara","sequence":"additional","affiliation":[{"name":"Yahoo Japan Corporation"}]},{"given":"Takaaki","family":"Ishii","sequence":"additional","affiliation":[{"name":"Yahoo Japan Corporation"}]},{"given":"Hirofumi","family":"Inaguma","sequence":"additional","affiliation":[{"name":"Kyoto University"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]}],"member":"263","reference":[{"key":"ref39","article-title":"Improving RNN-transducer with normalized join-ter network","author":"huang","year":"2020","journal-title":"arxiv 2011 01576 (arXv preprint)"},{"key":"ref38","first-page":"4395","article-title":"Self-attention trans-ducers for end-to-end speech recognition","author":"tian","year":"0","journal-title":"Proceedings of the Conference of the International Speech Communication As-sociation"},{"key":"ref33","article-title":"Accelerating RNN transducer infer-ence via one-step constrained beam search","author":"kim","year":"2020","journal-title":"arxiv 2011 01576 (arXv preprint)"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268948"},{"key":"ref31","first-page":"2818","article-title":"Re-thinking the inception architecture for computer vision","author":"szegedy","year":"0","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054663"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"ref35","first-page":"5206","article-title":"Lib-riSpeech: An ASR corpus based on public domain audio books","author":"panayotov","year":"0","journal-title":"Proceedings of the IEEE International Conference on Acoustics Speech & Signal Processing"},{"key":"ref34","year":"2019","journal-title":"Voxforge (italian)"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053889"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1510"},{"key":"ref11","article-title":"TensorFlowASR","author":"nguyen","year":"2020","journal-title":"Github Repository"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref14","article-title":"Speechbrain","author":"ravanelli","year":"2021","journal-title":"Github Repository"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1898"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383606"},{"key":"ref17","first-page":"944","article-title":"Monotonic re-current neural network transducer and decoding strategies","author":"tripathi","year":"0","journal-title":"Proceedings of the IEEE Automatic Speech Recognition and Understanding Workshop"},{"key":"ref18","article-title":"RNN-T for latency controlled ASR with improved beam ssearch","author":"jain","year":"2019","journal-title":"arXiv 1911 01629 (arXiv preprint)"},{"key":"ref19","first-page":"7804","article-title":"Alignment-length syn-chronous decoding for RNN transducer","author":"saon","year":"0","journal-title":"Proceedings of the IEEE International Conference on Acoustics Speech & Signal Processing"},{"key":"ref28","article-title":"Multitask learning and joint opti-mization for transformer-RNN-transducer speech recognition","author":"jeon","year":"2020","journal-title":"arxiv 2011 00771 (arXv preprint)"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682336"},{"key":"ref27","article-title":"Re-cent developments on ESPnet toolkit boosted by conformer","author":"pengcheng","year":"0","journal-title":"Proceedings of the IEEE International Conference on Acous-tics Speech & Signal Processing"},{"key":"ref3","article-title":"Sequence transduction with recurrent neural net-works","author":"graves","year":"2012","journal-title":"arXiv 1211 3711 (arXiv preprint)"},{"key":"ref6","first-page":"966","article-title":"Minimum Bayes Risk training of RNN-transducer for end-to-end speech recog-nition","author":"weng","year":"0","journal-title":"Proceedings of the Conference of the International Speech Communication Association"},{"key":"ref29","first-page":"193","article-title":"Exploring architectures, data and units for streaming end-to-end speech recognition with RNN transducer","author":"rao","year":"0","journal-title":"Proceedings of the IEEE Automatic Speech Recognition and Understanding Workshop"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053896"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-4022"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1705"},{"key":"ref2","article-title":"End-to-end attention-based large vocabulary speech recognition","author":"bahdanau","year":"2015","journal-title":"arxiv 1508 04395 (arXiv preprint)"},{"key":"ref9","article-title":"NeMo: A toolkit for building AI applications using neural modules","author":"kuchaiev","year":"2019","journal-title":"arxiv 1909 09577 (arXv preprint)"},{"key":"ref1","first-page":"369","article-title":"Con-nectionist temporal classification: Labelling unsegmented se-quence data with recurrent neural networks","author":"graves","year":"0","journal-title":"Proceedings of the International Conference on Machine Learning"},{"key":"ref20","article-title":"Reinforcement learning with unsupervised auxiliary tasks","author":"jadeberg","year":"2019","journal-title":"arXiv 1611 05397 (arXiv preprint)"},{"key":"ref22","article-title":"Improving RNN transducer based ASR with auxiliary tasks","author":"liu","year":"2020","journal-title":"arxiv 2011 03109 (arXiv preprint)"},{"key":"ref21","first-page":"3532","article-title":"Multitasklearning with low-level auxiliary tasks for encoder-decoder based speech recognition","author":"toshniwal","year":"0","journal-title":"Proceedings of the Conference of the International Speech Communication Association"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414198"},{"key":"ref24","first-page":"5060","article-title":"On training the recurrent neu-ral network encoder-decoder for large vocabulary end-to-end speech recognition","author":"lu","year":"0","journal-title":"Proceedings of the IEEE International Conference on Acoustics Speech & Signal Processing"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2404"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053887"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003750"}],"event":{"name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Cartagena, Colombia","start":{"date-parts":[[2021,12,13]]},"end":{"date-parts":[[2021,12,17]]}},"container-title":["2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9687821\/9687855\/09688251.pdf?arnumber=9688251","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T20:41:14Z","timestamp":1652733674000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9688251\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,13]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/asru51503.2021.9688251","relation":{},"subject":[],"published":{"date-parts":[[2021,12,13]]}}}