{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:08:26Z","timestamp":1775228906814,"version":"3.50.1"},"reference-count":87,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2018,4,1]],"date-time":"2018-04-01T00:00:00Z","timestamp":1522540800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"funder":[{"name":"Academia Sinica Thematic Research","award":["AS-105-TP-C02-1"],"award-info":[{"award-number":["AS-105-TP-C02-1"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Emerg. Top. Comput. Intell."],"published-print":{"date-parts":[[2018,4]]},"DOI":"10.1109\/tetci.2017.2784878","type":"journal-article","created":{"date-parts":[[2018,3,23]],"date-time":"2018-03-23T18:13:49Z","timestamp":1521828829000},"page":"117-128","source":"Crossref","is-referenced-by-count":204,"title":["Audio-Visual Speech Enhancement Using Multimodal Deep Convolutional Neural Networks"],"prefix":"10.1109","volume":"2","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4010-0220","authenticated-orcid":false,"given":"Jen-Cheng","family":"Hou","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6956-0418","authenticated-orcid":false,"given":"Syu-Siang","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Ying-Hui","family":"Lai","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Tsao","sequence":"additional","affiliation":[]},{"given":"Hsiu-Wen","family":"Chang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3599-5071","authenticated-orcid":false,"given":"Hsin-Min","family":"Wang","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/s10772-016-9332-x"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2012.07.005"},{"key":"ref71","first-page":"2489","article-title":"AVICAR: Audio-visual speech corpus in a car environment","author":"lee","year":"0","journal-title":"Proc Int Conf Spoken Lang"},{"key":"ref70","article-title":"100 nonspeech environmental sounds","author":"hu","year":"2004"},{"key":"ref76","article-title":"Lecture 6: Overview of mini-batch gradient descent","author":"hinton","year":"2012","journal-title":"Coursera Lecture slides"},{"key":"ref77","author":"chollet","year":"2015"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/HSCMA.2011.5942412"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/SAM.2002.1191001"},{"key":"ref75","doi-asserted-by":"crossref","DOI":"10.1201\/b14529","author":"loizou","year":"2013","journal-title":"Speech Enhancement Theory and Practice"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638354"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.2307\/3001913"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1038\/264746a0"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP.2017.8168119"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-211"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6637694"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702206083"},{"key":"ref36","first-page":"117","article-title":"Audiovisual speech recognition with missing or unreliable data","author":"kolossa","year":"0","journal-title":"Proc Int Conf Auditory-Visual Speech Process"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2003.817150"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-13015-5"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2014.06.002"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000013087.49260.fb"},{"key":"ref61","article-title":"Development of Taiwan Mandarin hearing in noise test","author":"huang","year":"2005"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007379606734"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-22482-4_11"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639012"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854294"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/89.902276"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1985.1164550"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/72.750549"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1155\/2010\/962103"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/JSEN.2016.2525811"},{"key":"ref69","doi-asserted-by":"crossref","first-page":"7","DOI":"10.1109\/TASLP.2014.2364452","article-title":"A regression approach to speech enhancement based on deep neural networks","volume":"23","author":"xu","year":"2015","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"ref2","first-page":"3002","article-title":"An investigation of spectral restoration algorithms for deep neural networks based noise robust speech recognition","author":"li","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref1","author":"li","year":"2015","journal-title":"Robust Automatic Speech Recognition A Bridge to Practical Applications"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TNN.2002.1021887"},{"key":"ref22","first-page":"436","article-title":"Speech enhancement based on deep denoising autoencoder","author":"lu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1049\/ip-vis:19960758"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2013.2291240"},{"key":"ref23","first-page":"885","article-title":"Ensemble modeling of denoising autoencoder for speech spectrum restoration","author":"lu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2628641"},{"key":"ref25","first-page":"2685","article-title":"Experiments on deep learning for speech denoising","author":"liu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2013.2296173"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820732"},{"key":"ref59","first-page":"363","article-title":"The hearing-aid speech quality index (HASQI)","volume":"58","author":"kates","year":"2010","journal-title":"J Audio Eng Soc"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.860851"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2114881"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.5244\/C.29.41"},{"key":"ref54","first-page":"1409","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"0","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.239"},{"key":"ref52","article-title":"MultiModal hybrid deep neural network for speech enhancement","author":"wu","year":"2016"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TBME.2016.2613960"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1097\/AUD.0000000000000074"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953172"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1097\/AUD.0000000000000537"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-49127-9_43"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1996.543199"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164453"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846321"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.851927"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/1389586.1389627"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2015.10.003"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-71505-4_12"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2535357"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/S0925-2312(03)00395-3"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.367"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICSLP.1996.607754"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.Workshops.2007.47"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2010.04.009"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1121\/1.3571422"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2017.8281993"},{"key":"ref8","first-page":"111","article-title":"Noise reduction in hearing aids: An overview","volume":"38","author":"levitt","year":"2001","journal-title":"J Rehabil Res Develop"},{"key":"ref86","article-title":"End-to-end waveform utterance enhancement for direct evaluation metrics optimization by fully convolutional neural networks","author":"fu","year":"2017"},{"key":"ref7","author":"venema","year":"2006","journal-title":"Compression for Clinicians"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.872619"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2017.2711489"},{"key":"ref46","first-page":"1959","article-title":"Enhancing audio speech using visual speech features","author":"almajai","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref45","first-page":"ii-2025?ii-2028","article-title":"Noisy audio feature enhancement using audio-visual speech data","author":"goecke","year":"0","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2007.04.008"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2096212"},{"key":"ref42","first-page":"689","article-title":"Multimodal deep learning","author":"ngiam","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2172427"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1121\/1.1358887"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178347"}],"container-title":["IEEE Transactions on Emerging Topics in Computational Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7433297\/8323304\/08323326.pdf?arnumber=8323326","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,12]],"date-time":"2022-01-12T16:21:50Z","timestamp":1642004510000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/8323326\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,4]]},"references-count":87,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tetci.2017.2784878","relation":{},"ISSN":["2471-285X"],"issn-type":[{"value":"2471-285X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,4]]}}}