{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T15:52:53Z","timestamp":1773330773365,"version":"3.50.1"},"reference-count":48,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/3.0\/legalcode"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2018]]},"DOI":"10.1109\/access.2018.2796118","type":"journal-article","created":{"date-parts":[[2018,2,2]],"date-time":"2018-02-02T19:20:00Z","timestamp":1517599200000},"page":"5573-5583","source":"Crossref","is-referenced-by-count":36,"title":["Auxiliary Loss Multimodal GRU Model in Audio-Visual Speech Recognition"],"prefix":"10.1109","volume":"6","author":[{"given":"Yuan","family":"Yuan","sequence":"first","affiliation":[]},{"given":"Chunlin","family":"Tian","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7037-5188","authenticated-orcid":false,"given":"Xiaoqiang","family":"Lu","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","author":"chung","year":"2014","journal-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling"},{"key":"ref38","first-page":"1","article-title":"Learning phrase representations using RNN encoder&#x2013;decoder for statistical machine translation","volume":"abs 1406 1078","author":"cho","year":"2015","journal-title":"CoRR"},{"key":"ref33","first-page":"1","article-title":"Conditional generative adversarial nets","volume":"abs 1411 1784","author":"mirza","year":"2014","journal-title":"CoRR"},{"key":"ref32","first-page":"1","article-title":"Auto-encoding variational Bayes","volume":"abs 1312 6114","author":"kingma","year":"2013","journal-title":"CoRR"},{"key":"ref31","first-page":"2672","article-title":"Generative adversarial nets","author":"goodfellow","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2703598"},{"key":"ref37","author":"mao","year":"2014","journal-title":"Deep captioning with multimodal recurrent neural networks (m-rnn)"},{"key":"ref36","author":"arjovsky","year":"2017","journal-title":"Wasserstein GAN"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.310"},{"key":"ref34","first-page":"1","article-title":"Unsupervised representation learning with deep convolutional generative adversarial networks","volume":"abs 1511 6434","author":"radford","year":"2015","journal-title":"CoRR"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/89.326616"},{"key":"ref40","first-page":"1","article-title":"Visualizing and understanding recurrent networks","volume":"abs 1506 2078","author":"karpathy","year":"2015","journal-title":"CoRR"},{"key":"ref11","year":"2005","journal-title":"Speech processing transmission and quality aspects (STQ) Distributed speech recognition Advanced front-end feature extraction algorithm Compression algorithms"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2017.2728801"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2016.2607778"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639038"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2005.1511828"},{"key":"ref16","doi-asserted-by":"crossref","DOI":"10.1201\/b14529","author":"loizou","year":"2013","journal-title":"Speech Enhancement Theory and Practice"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6288962"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2016.11.007"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268912"},{"key":"ref28","author":"goodfellow","year":"2016","journal-title":"Deep Learning"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.389"},{"key":"ref27","first-page":"1601","article-title":"The recurrent temporal restricted Boltzmann machine","author":"sutskever","year":"2008","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref3","first-page":"689","article-title":"Multimodal deep learning","author":"ngiam","year":"2011","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2702596"},{"key":"ref5","first-page":"1764","article-title":"Towards end-to-end speech recognition with recurrent neural networks","author":"graves","year":"2014","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953077"},{"key":"ref7","first-page":"173","article-title":"Deep speech 2: End-to-end speech recognition in english and mandarin","author":"amodei","year":"2016","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref2","first-page":"1","article-title":"Learning representations for multimodal data with deep belief nets","author":"srivastava","year":"2012","journal-title":"Proc Int Machine Learning Workshop"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639100"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2011.07.024"},{"key":"ref46","article-title":"Human action recognition using deep probabilistic graphical models","author":"wu","year":"2014"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-76316-3"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2014.6836053"},{"key":"ref48","first-page":"1","article-title":"Very deep convolutional networks for large-scale image recognition","volume":"abs 1409 1556","author":"simonyan","year":"2014","journal-title":"CoRR"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702206083"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2001.990517"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-010-0182-0"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2009.2030637"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178347"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/34.982900"},{"key":"ref23","first-page":"250","article-title":"Hidden Markov models in speech recognition","volume":"7","author":"krajcovic","year":"2008","journal-title":"Adv Electr Electron Eng"},{"key":"ref44","author":"bouwmans","year":"2016","journal-title":"On the role and the importance of features for background modeling and foreground detection"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2015.7415335"},{"key":"ref43","first-page":"32","article-title":"An HOG-LBP human detector with partial occlusion handling","author":"wang","year":"2010","journal-title":"Proc IEEE Int Conf Comput Vis"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-014-0629-7"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/8274985\/08279447.pdf?arnumber=8279447","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,12]],"date-time":"2022-01-12T16:08:43Z","timestamp":1642003723000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8279447\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/access.2018.2796118","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]}}}