{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:10:49Z","timestamp":1775326249765,"version":"3.50.1"},"reference-count":62,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,9]]},"DOI":"10.1109\/slt54892.2023.10022656","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T18:54:03Z","timestamp":1674845643000},"page":"84-91","source":"Crossref","is-referenced-by-count":85,"title":["E-Branchformer: Branchformer with Enhanced Merging for Speech Recognition"],"prefix":"10.1109","author":[{"given":"Kwangyoun","family":"Kim","sequence":"first","affiliation":[{"name":"ASAPP Inc.,Mountain View,CA,USA"}]},{"given":"Felix","family":"Wu","sequence":"additional","affiliation":[{"name":"ASAPP Inc.,Mountain View,CA,USA"}]},{"given":"Yifan","family":"Peng","sequence":"additional","affiliation":[{"name":"ASAPP Inc.,Mountain View,CA,USA"}]},{"given":"Jing","family":"Pan","sequence":"additional","affiliation":[{"name":"ASAPP Inc.,Mountain View,CA,USA"}]},{"given":"Prashant","family":"Sridhar","sequence":"additional","affiliation":[{"name":"ASAPP Inc.,Mountain View,CA,USA"}]},{"given":"Kyu J.","family":"Han","sequence":"additional","affiliation":[{"name":"ASAPP Inc.,Mountain View,CA,USA"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,Pittsburgh,PA,USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Asr-glue: A new multi-task bench-mark for asr-robust natural language understanding","author":"Feng","year":"2021","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.588"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/icassp43922.2022.9746137"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2016.7472621"},{"key":"ref5","article-title":"Sequence transduction with recurrent neural net-works","author":"Graves","year":"2012","journal-title":"arXiv preprint"},{"key":"ref6","article-title":"Transformer transducer: A streamable speech recognition model with trans-former encoders and rnn-t loss","author":"Zhang","year":"2020","journal-title":"ICASSP"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref8","article-title":"Joint etc-attention based end-to-end speech recognition using multi-task learning","author":"Kim","year":"2017","journal-title":"ICASSP"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-711"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2680"},{"key":"ref11","article-title":"Se-mantic mask for transformer based end-to-end speech recognition","author":"Wang","year":"2019","journal-title":"arXiv preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268935"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1616"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9004027"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1819"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053887"},{"key":"ref17","article-title":"Contextnet: Improving convolutional neural networks for automatic speech recognition with global con-text","author":"Han","year":"2020","journal-title":"arXiv preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2018.8462506"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"ref20","article-title":"Streaming au-tomatic speech recognition with the transformer model","author":"Moritz","year":"2020","journal-title":"ICASSP)"},{"key":"ref21","article-title":"Rethinking attention with performers","author":"Choromanski","year":"2020","journal-title":"arXiv preprint"},{"key":"ref22","article-title":"Linformer: Self-attention with linear complexity","author":"Wang","year":"2020","journal-title":"arXiv preprint"},{"key":"ref23","article-title":"An attention free transformer","author":"Zhai","year":"2021","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3530811"},{"key":"ref25","article-title":"Fastformer: Additive attention can be all you need","author":"Wu","year":"2021","journal-title":"arXiv preprint"},{"key":"ref26","article-title":"Deepnet: Scaling transformers to 1, 000 layers","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/icassp43922.2022.9746187"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref29","article-title":"Branchformer: Parallel mlp-attention architectures to capture local and global context for speech recognition and understanding","author":"Peng","year":"2022","journal-title":"ICML"},{"key":"ref30","article-title":"Deep speech: Scaling up end-to-end speech recognition","author":"Hannun","year":"2014","journal-title":"arXiv preprint"},{"key":"ref31","article-title":"Qanet: Combining local convolution with global self-attention for reading comprehension","author":"Yu","year":"2018","journal-title":"arXiv preprint"},{"key":"ref32","article-title":"The evolved trans-former","author":"So","year":"2019","journal-title":"ICML"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"ref35","article-title":"Coatnet: Marrying convolution and attention for all data sizes","author":"Dai","year":"2021","journal-title":"NeurIPS"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01739-w"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_27"},{"key":"ref38","article-title":"Lite transformer with long-short range attention","author":"Wu","year":"2020","journal-title":"ICLR"},{"key":"ref39","article-title":"Pay less attention with lightweight and dy-namic convolutions","author":"Wu","year":"2019","journal-title":"ICLR"},{"key":"ref40","article-title":"Convbert: Improving bert with span-based dynamic convolution","author":"Jiang","year":"2020","journal-title":"NeurIPS"},{"key":"ref41","article-title":"Electra: Pre-training text encoders as discriminators rather than generators","author":"Clark","year":"2020","journal-title":"arXiv preprint"},{"key":"ref42","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv preprint"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00089"},{"key":"ref44","volume-title":"Mlp-based architecture with variable length input for automatic speech recognition","author":"Sakuma","year":"2021"},{"key":"ref45","article-title":"Inception transformer","author":"Si","year":"2022","journal-title":"arXiv preprint"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_38"},{"key":"ref47","article-title":"Layer normalization","author":"Ba","year":"2016","journal-title":"arXiv preprint"},{"key":"ref48","article-title":"Dropout: a simple way to prevent neural networks from overfitting","author":"Srivastava","year":"2014","journal-title":"JMLR"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2074"},{"key":"ref50","article-title":"Gaussian error linear units (gelus)","author":"Hendrycks","year":"2016","journal-title":"arXiv preprint"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01166"},{"key":"ref52","volume-title":"More convnets in the 2020s: Scaling up kernels beyond 51x51 using sparsity","author":"Liu","year":"2022"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref54","article-title":"Searching for activation functions","author":"Ramachandran","year":"2017","journal-title":"arXiv preprint"},{"key":"ref55","article-title":"Improving trans-former models by reordering their sublayers","author":"Press","year":"2020","journal-title":"ACL"},{"key":"ref56","article-title":"Librispeech: an asr corpus based on public do-main audio books","author":"Panayotov","year":"2015","journal-title":"ICASSP"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383515"},{"key":"ref59","article-title":"Re-cent developments on espnet toolkit boosted by conformer","author":"Guo","year":"2021","journal-title":"ICASSP"},{"key":"ref60","article-title":"End-to-end asr: from supervised to semi-supervised learning with modern architectures","author":"Synnaeve","year":"2019","journal-title":"arXiv preprint"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"}],"event":{"name":"2022 IEEE Spoken Language Technology Workshop (SLT)","location":"Doha, Qatar","start":{"date-parts":[[2023,1,9]]},"end":{"date-parts":[[2023,1,12]]}},"container-title":["2022 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10022052\/10022330\/10022656.pdf?arnumber=10022656","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T08:10:05Z","timestamp":1707811805000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10022656\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,9]]},"references-count":62,"URL":"https:\/\/doi.org\/10.1109\/slt54892.2023.10022656","relation":{},"subject":[],"published":{"date-parts":[[2023,1,9]]}}}