{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T05:30:33Z","timestamp":1730266233129,"version":"3.28.0"},"reference-count":39,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,7]]},"DOI":"10.1109\/ijcnn.2018.8489589","type":"proceedings-article","created":{"date-parts":[[2018,10,19]],"date-time":"2018-10-19T22:25:09Z","timestamp":1539987909000},"page":"1-8","source":"Crossref","is-referenced-by-count":3,"title":["Syllable-Based Acoustic Modeling with CTC for Multi-Scenarios Mandarin speech recognition"],"prefix":"10.1109","author":[{"given":"Yuanyuan","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Linhao","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Shuang","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Xu","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854675"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178839"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.3115\/1075812.1075885"},{"key":"ref32","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-642-24797-2","author":"graves","year":"2012","journal-title":"Supervised Sequence Labelling with Recurrent Neural Networks"},{"key":"ref31","doi-asserted-by":"crossref","first-page":"709","DOI":"10.21437\/Interspeech.2017-505","article-title":"Ctc training of multi-phone acoustic models for speech recognition","author":"siohan","year":"2017","journal-title":"Proc Interspeech 2017"},{"key":"ref30","first-page":"173","article-title":"Deep speech 2: End-to-end speech recognition in english and mandarin","author":"amodei","year":"2016","journal-title":"International Conference on Machine Learning"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1990.2.4.490"},{"key":"ref36","first-page":"153","article-title":"Greedy layer-wise training of deep networks","volume":"19","author":"bengio","year":"2007","journal-title":"Advances in neural information processing systems"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638950"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404790"},{"key":"ref10","article-title":"Audio augmentation for speech recognition","author":"ko","year":"2015","journal-title":"Sixteenth Annual Conference of the International Speech Communication Association"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6288864"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639347"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2339736"},{"key":"ref14","article-title":"Ensemble deep learning for speech recognition","author":"deng","year":"2014","journal-title":"Fifteenth Annual Conference of the International Speech Communication Association"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178838"},{"key":"ref16","article-title":"Learning the speech front-end with raw waveform cldnns","author":"sainath","year":"2015","journal-title":"Sixteenth Annual Conference of the International Speech Communication Association"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178847"},{"key":"ref18","article-title":"Advances in joint ctc-attention based end-to-end speech recognition with a deep cnn encoder and rnn-lm","author":"hori","year":"2017","journal-title":"arXiv preprint arXiv 1706 02737"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953077"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2013.6694176"},{"key":"ref4","article-title":"Syllable-based acoustic modeling with ctc-smbr-lstm","author":"zhongdi qu","year":"2017","journal-title":"IEEE Automatic Speech Recognition and Understanding Workshop"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2210952"},{"key":"ref3","article-title":"Svd-based universal dnn modeling for multiple scenarios","author":"liu","year":"2015","journal-title":"Sixteenth Annual Conference of the International Speech Communication Association"},{"key":"ref6","article-title":"Robust speech recognition using generative adversarial networks","author":"sriram","year":"2017","journal-title":"arXiv preprint arXiv 1711 01567"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472152"},{"key":"ref5","article-title":"Towards better performance with heterogeneous training data in acoustic modeling using deep neural networks","author":"huang","year":"2014","journal-title":"Fifteenth Annual Conference of the International Speech Communication Association"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2013.6707748"},{"article-title":"Cross-domain speech recognition using nonparallel corpora with cycle-consistent adversarial networks","year":"0","author":"mimura","key":"ref7"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-70136-3_91"},{"key":"ref9","article-title":"Vocal tract length perturbation (vtlp) improves speech recognition","volume":"117","author":"jaitly","year":"2013","journal-title":"ICML Workshop on Deep Learning for Audio Speech and Language Processing"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2012.6424210"},{"key":"ref20","article-title":"Layer normalization","author":"ba","year":"2016","journal-title":"arXiv preprint arXiv 1607 06450"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/89.917681"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1980.1170934"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(99)00050-3"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.1997.659007"},{"key":"ref26","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1155\/2007\/46460","article-title":"On the utility of syllable-based acoustic models for pronunciation variation modelling","volume":"2007","author":"h\u00e4m\u00e4l\u00e4inen","year":"2007","journal-title":"EURASIP Journal on Audio Speech and Music Processing"},{"article-title":"Syllable-length acoustic units in large-vocabulary continuous speech recognition","year":"2005","author":"h\u00e4m\u00e4l\u00e4inen","key":"ref25"}],"event":{"name":"2018 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2018,7,8]]},"location":"Rio de Janeiro","end":{"date-parts":[[2018,7,13]]}},"container-title":["2018 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8465565\/8488986\/08489589.pdf?arnumber=8489589","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,8,24]],"date-time":"2020-08-24T01:56:03Z","timestamp":1598234163000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8489589\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,7]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/ijcnn.2018.8489589","relation":{},"subject":[],"published":{"date-parts":[[2018,7]]}}}