{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,16]],"date-time":"2026-07-16T02:34:02Z","timestamp":1784169242551,"version":"3.55.0"},"reference-count":24,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2013,12]]},"DOI":"10.1109\/asru.2013.6707758","type":"proceedings-article","created":{"date-parts":[[2014,1,10]],"date-time":"2014-01-10T20:07:23Z","timestamp":1389384443000},"page":"368-373","source":"Crossref","is-referenced-by-count":106,"title":["Large scale deep neural network acoustic modeling with semi-supervised training data for YouTube video transcription"],"prefix":"10.1109","author":[{"given":"Hank","family":"Liao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Erik","family":"McDermott","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Senior","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5947611"},{"key":"22","article-title":"Large scale language modeling in automatic speech recognition","author":"chelba","year":"2012","journal-title":"Google Tech Rep"},{"key":"17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638949"},{"key":"23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638951"},{"key":"18","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2012-7","article-title":"Pipelined back-propagation for context-dependent deep neural networks","author":"chen","year":"2012","journal-title":"Proc INTERSPEECH"},{"key":"24","article-title":"Large language models in machine translation","author":"brants","year":"2007","journal-title":"Proc of Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning"},{"key":"15","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2011.6163899"},{"key":"16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639212"},{"key":"13","doi-asserted-by":"publisher","DOI":"10.1109\/89.906002"},{"key":"14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638963"},{"key":"11","doi-asserted-by":"publisher","DOI":"10.1006\/csla.2001.0186"},{"key":"12","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2011.06.001"},{"key":"21","doi-asserted-by":"publisher","DOI":"10.3115\/1075812.1075885"},{"key":"3","article-title":"Hybrid neural network\/hidden markov model continuous-speech recognition","author":"cohen","year":"1992","journal-title":"Proc EUROSPEECH"},{"key":"20","article-title":"Cudamat: A cuda-based matrix class for python","author":"mnih","year":"2009","journal-title":"Tech Rep TR 2009-004 Department of Computer Science"},{"key":"2","article-title":"Speech activity detection on youtube using deep neural networks","author":"ryant","year":"2013","journal-title":"Proc INTERSPEECH"},{"key":"1","author":"alberti","year":"2009","journal-title":"Automatic Captioning in YouTube"},{"key":"10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960722"},{"key":"7","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2011.6163900"},{"key":"6","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2109382"},{"key":"5","article-title":"Roles of pretraining and fine-tuning in context-dependent dnn-hmms for real-world speech recognition","author":"yu","year":"2010","journal-title":"Proc NIPS Workshop on Deep Learning and Unsupervised Feature Learning"},{"key":"4","doi-asserted-by":"publisher","DOI":"10.1038\/323533a0"},{"key":"9","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2012-10","article-title":"Application of pretrained deep neural networks to large vocabulary speech recognition","author":"jaitly","year":"2012","journal-title":"Proc INTERSPEECH"},{"key":"8","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2011-169","article-title":"Conversational speech transcription using context-dependent deep neural networks","author":"seide","year":"2011","journal-title":"Proc INTERSPEECH"}],"event":{"name":"2013 IEEE Workshop on Automatic Speech Recognition & Understanding (ASRU)","location":"Olomouc, Czech Republic","start":{"date-parts":[[2013,12,8]]},"end":{"date-parts":[[2013,12,12]]}},"container-title":["2013 IEEE Workshop on Automatic Speech Recognition and Understanding"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6695806\/6707689\/06707758.pdf?arnumber=6707758","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,3,23]],"date-time":"2022-03-23T00:59:08Z","timestamp":1647997148000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/6707758\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013,12]]},"references-count":24,"URL":"https:\/\/doi.org\/10.1109\/asru.2013.6707758","relation":{},"subject":[],"published":{"date-parts":[[2013,12]]}}}