{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:48:26Z","timestamp":1776887306405,"version":"3.51.2"},"reference-count":42,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/icassp40776.2020.9054345","type":"proceedings-article","created":{"date-parts":[[2020,4,9]],"date-time":"2020-04-09T20:21:13Z","timestamp":1586463673000},"page":"6874-6878","source":"Crossref","is-referenced-by-count":148,"title":["Transformer-Based Acoustic Modeling for Hybrid Speech Recognition"],"prefix":"10.1109","author":[{"given":"Yongqiang","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abdelrahman","family":"Mohamed","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Due","family":"Le","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chunxi","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alex","family":"Xiao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jay","family":"Mahadeokar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongzhao","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Andros","family":"Tjandra","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaohui","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Frank","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Christian","family":"Fuegen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Geoffrey","family":"Zweig","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael L.","family":"Seltzer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","author":"simonyan","year":"2014","journal-title":"Very Deep Convolutional Networks for Large-scale Image Recognition"},{"key":"ref38","author":"luscher","year":"2019","journal-title":"RWTH ASR systems for Lib-riSpeech hybrid vs attention-w\/o data augmentation"},{"key":"ref33","article-title":"The kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"IEEE Workshop on Automatic Speech Recognition and Understanding"},{"key":"ref32","first-page":"5206","article-title":"Librispeech: an asr corpus based on public domain audio books","author":"panayotov","year":"2015","journal-title":"Proc ICASSP"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013159"},{"key":"ref30","author":"karita","year":"2019","journal-title":"A comprative study on Transformer vs RNN in speech applications"},{"key":"ref37","author":"kingma","year":"2014","journal-title":"Adam A method for stochastic optimization"},{"key":"ref36","article-title":"fairseq: A Fast, Extensible Toolkit for Sequence Modeling","author":"myle","year":"2019","journal-title":"Proceedings of NAACL-HLT 2019 Demonstrations"},{"key":"ref35","doi-asserted-by":"crossref","first-page":"2345","DOI":"10.21437\/Interspeech.2013-548","article-title":"Sequence-discriminative training of deep neural networks","volume":"2013","author":"vesely","year":"2013","journal-title":"Proc INTERSPEECH"},{"key":"ref34","article-title":"Audio augmentation for speech recognition","author":"ko","year":"2015","journal-title":"Proc INTERSPEECH"},{"key":"ref10","author":"park","year":"2019","journal-title":"SpecAugment A simple data augmentation method for automatic speech recognition"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472780"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682336"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/72.279181"},{"key":"ref13","author":"collobert","year":"2016","journal-title":"Wav2letter an end-to-end convnet-based speech recognition system"},{"key":"ref14","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc NIPS"},{"key":"ref15","author":"devlin","year":"2018","journal-title":"BERT Pre-training of deep bidirectional transformers for language understanding"},{"key":"ref16","author":"radford","year":"2018","journal-title":"Improving language understanding by generative pre-training"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"ref18","author":"sperber","year":"2018","journal-title":"Self-attentional acoustic models"},{"key":"ref19","author":"zhou","year":"2018","journal-title":"Syllable-based sequence-to-sequence speech recognition with the transformer in mandarin chinese"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2339736"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953077"},{"key":"ref3","article-title":"Long short-term memory recurrent neural network architectures for large scale acoustic modeling","author":"sak","year":"2014","journal-title":"Proc INTERSPEECH"},{"key":"ref27","author":"mohamed","year":"2019","journal-title":"Transformers with convolutional context for ASR"},{"key":"ref6","author":"zhang","year":"2015","journal-title":"Feed-forward sequential memory neural networks without recurrent feedback"},{"key":"ref5","article-title":"A time delay neural network architecture for efficient modeling of long temporal contexts","author":"peddinti","year":"2015","journal-title":"Proc INTERSPEECH"},{"key":"ref29","article-title":"Deja-vu: Double feature presentation in deep transformer networks","author":"tjandra","year":"2020","journal-title":"Submitted to ICASSP"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref2","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2011-169","article-title":"Conversational speech transcription using context-dependent deep neural networks","author":"seide","year":"2011","journal-title":"Proc INTERSPEECH"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462105"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205597"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462497"},{"key":"ref22","volume":"247","author":"bourlard","year":"0","journal-title":"Connectionist Speech Recognition A Hybrid Approach"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682539"},{"key":"ref42","author":"chen","year":"2018","journal-title":"The best of both worlds Combining recent advances in neural machine translation"},{"key":"ref24","author":"le","year":"2019","journal-title":"From senones to chenones Tied context-dependent graphemes for hybrid speech recognition"},{"key":"ref41","author":"han","year":"2019","journal-title":"State-of-the-art speech recognition using multi-stream self-attention with dilated 1d convolutions"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225979"},{"key":"ref26","author":"hendrycks","year":"2016","journal-title":"Gaussian Error Linear Units (GELUs)[J]"},{"key":"ref25","author":"lei ba","year":"2016","journal-title":"Layer normalization"}],"event":{"name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Barcelona, Spain","start":{"date-parts":[[2020,5,4]]},"end":{"date-parts":[[2020,5,8]]}},"container-title":["ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9040208\/9052899\/09054345.pdf?arnumber=9054345","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,3]],"date-time":"2024-08-03T19:26:39Z","timestamp":1722713199000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9054345\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/icassp40776.2020.9054345","relation":{},"subject":[],"published":{"date-parts":[[2020,5]]}}}