{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,28]],"date-time":"2026-06-28T05:33:29Z","timestamp":1782624809057,"version":"3.54.5"},"reference-count":62,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2019,11,1]],"date-time":"2019-11-01T00:00:00Z","timestamp":1572566400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2019,11,1]],"date-time":"2019-11-01T00:00:00Z","timestamp":1572566400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2019,11,1]],"date-time":"2019-11-01T00:00:00Z","timestamp":1572566400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Signal Process. Mag."],"published-print":{"date-parts":[[2019,11]]},"DOI":"10.1109\/msp.2019.2918706","type":"journal-article","created":{"date-parts":[[2019,10,30]],"date-time":"2019-10-30T20:19:07Z","timestamp":1572466747000},"page":"111-124","source":"Crossref","is-referenced-by-count":136,"title":["Speech Processing for Digital Home Assistants: Combining Signal Processing With Deep-Learning Techniques"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9468-7330","authenticated-orcid":false,"given":"Reinhold","family":"Haeb-Umbach","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5970-8631","authenticated-orcid":false,"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7487-7150","authenticated-orcid":false,"given":"Tomohiro","family":"Nakatani","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4527-0197","authenticated-orcid":false,"given":"Michiel","family":"Bacchiani","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bjorn","family":"Hoffmeister","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Michael L.","family":"Seltzer","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Heiga","family":"Zen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mehrez","family":"Souden","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref39","first-page":"399","article-title":"Acoustic modeling for Google Home","author":"li","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref38","article-title":"Multichannel end-to-end speech recognition","author":"ochiai","year":"0","journal-title":"Proc Int Conf Machine Learning (ICML)"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2004.832988"},{"key":"ref32","first-page":"379","article-title":"Generation of large-scale simulated utterances in virtual rooms to train deep-neural networks for far-field speech recognition in Google Home","author":"kim","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/IWAENC.2014.6954309"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1121\/1.382599"},{"key":"ref37","article-title":"State-of-the-art speech recognition with sequence-to-sequence models","author":"chiu","year":"2017"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953173"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472778"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2672401"},{"key":"ref60","first-page":"1487","article-title":"End-to-end text-independent speaker verification with triplet loss on short utterances","author":"zhang","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref62","year":"0","journal-title":"Signal processing in home assistants"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952154"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"ref29","article-title":"Deep residual learning for image recognition","author":"he","year":"2015"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1768"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2016.11.005"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2004.828896"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471664"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2647702"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2016.11.007"},{"key":"ref23","first-page":"1981","article-title":"Improved MVDR beamforming using single-channel mask prediction networks","author":"erdogan","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref26","first-page":"114","article-title":"A survey of convolutive blind source separation methods","author":"pedersen","year":"2007","journal-title":"Springer Handbook on Speech Processing and Speech Communication"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462069"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462227"},{"key":"ref51","article-title":"Hey Siri: An on-device DNN-powered voice trigger for Apple&#x2019;s personal assistant","volume":"1","year":"2017","journal-title":"Mach Learn J"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846261"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1346"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1205"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1531"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461478"},{"key":"ref53","first-page":"2912","article-title":"Accurate endpointing with expected pause duration","author":"liu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref52","first-page":"1909","article-title":"Improved end-of-query detection for streaming speech recognition","author":"shannon","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2012.2210879"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2196"},{"key":"ref40","first-page":"468","article-title":"Bringing contextual information to Google speech recognition","author":"aleksic","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960438"},{"key":"ref13","first-page":"2017","article-title":"Adaptive multichannel dereverberation for automatic speech recognition","author":"caroselli","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref14","year":"0","journal-title":"The 5th CHiME speech separation and recognition challenge Results"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461669"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462372"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2016.7760429"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5495994"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2050716"},{"key":"ref4","first-page":"547","article-title":"The automatic speech recognition in reverberant environments (ASpIRE) challenge","author":"harper","year":"0","journal-title":"Proc IEEE Workshop Automatic Speech Recognition and Understanding (ASRU)"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1186\/s13634-016-0306-6"},{"key":"ref6","article-title":"Optimizing Siri on HomePod in far-field settings","volume":"1","year":"2018","journal-title":"Mach Learn J"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205029"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2372342"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-04437-7"},{"key":"ref49","first-page":"220","article-title":"An application of recurrent neural networks to discriminative keyword spotting","author":"fern\u00e1ndez","year":"0","journal-title":"Proc Intl Conf on Artificial Neural Networks"},{"key":"ref9","first-page":"85","article-title":"Blind speech dereverberation with multi-channel linear prediction based on short time Fourier transform representation","author":"nakatani","year":"0","journal-title":"Proc IEEE Int Conf Acoustics Speech and Signal Processing (ICASSP)"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854370"},{"key":"ref45","article-title":"Wavenet: A generative model for raw audio","author":"van den oord","year":"2016"},{"key":"ref48","first-page":"252","article-title":"Direct modeling of raw audio with DNNs for wake word detection","author":"kumatani","year":"0","journal-title":"Proc IEEE Workshop Automatic Speech Recognition and Understanding (ASRU)"},{"key":"ref47","first-page":"1478","article-title":"Convolutional neural networks for small-footprint keyword spotting","author":"sainath","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6289013"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1122"},{"key":"ref44","first-page":"3918","article-title":"Parallel WaveNet: Fast high-fidelity speech synthesis","author":"van den oord","year":"0","journal-title":"Proc Int Conf Machine Learning (ICML)"},{"key":"ref43","first-page":"4011","article-title":"Siri on-device deep learning-guided unit selection text-to-speech system","author":"capes","year":"0","journal-title":"Proc INTERSPEECH"}],"container-title":["IEEE Signal Processing Magazine"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/79\/8887548\/08887564.pdf?arnumber=8887564","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,13]],"date-time":"2022-07-13T21:07:36Z","timestamp":1657746456000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8887564\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,11]]},"references-count":62,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/msp.2019.2918706","relation":{},"ISSN":["1053-5888","1558-0792"],"issn-type":[{"value":"1053-5888","type":"print"},{"value":"1558-0792","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,11]]}}}