{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T08:38:31Z","timestamp":1768811911318,"version":"3.49.0"},"reference-count":27,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2015,11,3]],"date-time":"2015-11-03T00:00:00Z","timestamp":1446508800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2016,5]]},"DOI":"10.1007\/s11042-015-3038-y","type":"journal-article","created":{"date-parts":[[2015,11,3]],"date-time":"2015-11-03T11:38:02Z","timestamp":1446550682000},"page":"5223-5245","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Mapping ultrasound-based articulatory images and vowel sounds with a deep neural network framework"],"prefix":"10.1007","volume":"75","author":[{"given":"Jianguo","family":"Wei","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiang","family":"Fang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinyuan","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenhuan","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuqing","family":"He","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianwu","family":"Dang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2015,11,3]]},"reference":[{"key":"3038_CR1","doi-asserted-by":"crossref","unstructured":"Badino L, Canevari C, Fadiga L, Metta G (2012) Deep-level acoustic-to-articulatory mapping for DBN-HMM based phone recognition. In Spoken Language Technology Workshop (SLT). IEEE 370\u2013375","DOI":"10.1109\/SLT.2012.6424252"},{"key":"3038_CR2","doi-asserted-by":"crossref","unstructured":"Ben-Youssef A, Shimodaira H, Braude D (2014) Speech driven talking head from estimated articulatory features. In Acoustics, Speech and Signal Processing (ICASSP). IEEE Int Conf IEEE 4573\u20134577","DOI":"10.1109\/ICASSP.2014.6854468"},{"key":"3038_CR3","doi-asserted-by":"crossref","unstructured":"Ghosh PK, Narayanan SS (2011) A subject-independent acoustic-to-articulatory inversion. In Acoustics, Speech and Signal Processing (ICASSP). IEEE Int Conf IEEE 4624\u20134627","DOI":"10.1109\/ICASSP.2011.5947385"},{"issue":"1","key":"3038_CR4","first-page":"926","volume":"9","author":"G Hinton","year":"2010","unstructured":"Hinton G (2010) A practical guide to training restricted Boltzmann machines. Momentum 9(1):926","journal-title":"Momentum"},{"issue":"2","key":"3038_CR5","doi-asserted-by":"crossref","first-page":"175","DOI":"10.1109\/TSA.2003.822636","volume":"12","author":"S Hiroya","year":"2004","unstructured":"Hiroya S, Honda M (2004) Estimation of articulatory movements from speech acoustics using an HMM-based speech production model. IEEE Trans Speech Audio Process 12(2):175\u2013185","journal-title":"IEEE Trans Speech Audio Process"},{"key":"3038_CR6","doi-asserted-by":"crossref","unstructured":"Hodgen J, Valdez P (2001) A stochastic articulatory-to-acoustic mapping as a basis for speech recognition. In Instrumentation and Measurement Technology Conference, 2001. IMTC 2001. Proceedings of the 18th IEEE. IEEE 2:1105\u20131110","DOI":"10.1109\/IMTC.2001.928251"},{"issue":"3","key":"3038_CR7","doi-asserted-by":"crossref","first-page":"1819","DOI":"10.1121\/1.416001","volume":"100","author":"J Hogden","year":"1996","unstructured":"Hogden J, Lofqvist A, Gracco V, Zlokarnik I, Rubin P, Saltzman E (1996) Accurate recovery of articulator positions from acoustics: new conclusions based on human data. J Acoust Soc Am 100(3):1819\u20131834","journal-title":"J Acoust Soc Am"},{"key":"3038_CR8","doi-asserted-by":"crossref","unstructured":"Huang J, Kingsbury B (2013) Audio-visual deep learning for noise robust speech recognition. In Acoustics, Speech and Signal Processing (ICASSP). IEEE Int Conf IEEE 7596\u20137599","DOI":"10.1109\/ICASSP.2013.6639140"},{"issue":"4","key":"3038_CR9","doi-asserted-by":"crossref","first-page":"2354","DOI":"10.1121\/1.1715112","volume":"116","author":"CT Kello","year":"2004","unstructured":"Kello CT, Plaut DC (2004) A neural network model of the articulatory-acoustic forward mapping trained on recordings of articulatory parameters. J Acoust Soc Am 116(4):2354\u20132364","journal-title":"J Acoust Soc Am"},{"issue":"4","key":"3038_CR10","doi-asserted-by":"crossref","first-page":"2354","DOI":"10.1121\/1.1715112","volume":"116","author":"CT Kello","year":"2004","unstructured":"Kello CT, Plaut DC (2004) A neural network model of the articulatory-acoustic forward mapping trained on recordings of articulatory parameters. J Acoust Soc Am 116(4):2354\u20132364","journal-title":"J Acoust Soc Am"},{"key":"3038_CR11","doi-asserted-by":"crossref","unstructured":"Ladefoged P (1980) What are linguistic sounds made of? Language 485\u2013502","DOI":"10.2307\/414446"},{"key":"3038_CR12","doi-asserted-by":"crossref","unstructured":"Livescu K, Cetin O, Hasegawa-Johnson M, King S, Bartels C, Borges N, Saenko K (2007) Articulatory feature-based methods for acoustic and audio-visual speech recognition: Summary from the 2006 JHU summer workshop. In Acoustics, Speech and Signal Processing, 2007. ICASSP 2007. IEEE Int Conf IEEE 4:IV-621","DOI":"10.1109\/ICASSP.2007.366989"},{"key":"3038_CR13","doi-asserted-by":"crossref","unstructured":"Nakamura K, Toda T, Nankaku Y, Tokuda K (2006) On the use of phonetic information for mapping from articulatory movements to vocal tract spectrum. In Acoustics, Speech and Signal Processing, 2006. ICASSP 2006 Proceedings. IEEE Int Conf IEEE 1:I-I","DOI":"10.1109\/ICASSP.2006.1659965"},{"key":"3038_CR14","doi-asserted-by":"crossref","unstructured":"Nefian AV, Liang L, Pi X, Xiaoxiang L, Mao C, Murphy K (2002) A coupled HMM for audio-visual speech recognition. In Acoustics, Speech, and Signal Processing (ICASSP). IEEE Int Conf IEEE 2:II-2013","DOI":"10.1109\/ICASSP.2002.5745027"},{"key":"3038_CR15","unstructured":"Ngiam J, Khosla A, Kim M, Nam J, Lee H, Ng AY (2011) Multimodal deep learning. In Proceedings of the 28th international conference on machine learning (ICML-11) 689\u2013696"},{"issue":"3","key":"3038_CR16","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1109\/TASL.2008.2011515","volume":"17","author":"G Papandreou","year":"2009","unstructured":"Papandreou G, Katsamanis A, Pitsikalis V, Maragos P (2009) Adaptive multimodal fusion by uncertainty compensation with application to audiovisual speech recognition. IEEE Trans Audio Speech Lang Process 17(3):423\u2013435","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"3038_CR17","doi-asserted-by":"crossref","unstructured":"Richmond K (2006) A trajectory mixture density network for the acoustic-articulatory inversion mapping. In Interspeech","DOI":"10.21437\/Interspeech.2006-213"},{"issue":"2","key":"3038_CR18","doi-asserted-by":"crossref","first-page":"153","DOI":"10.1016\/S0885-2308(03)00005-6","volume":"17","author":"K Richmond","year":"2003","unstructured":"Richmond K, King S, Taylor P (2003) Modelling the uncertainty in recovering articulation from acoustics. Comput Speech Lang 17(2):153\u2013172","journal-title":"Comput Speech Lang"},{"key":"3038_CR19","doi-asserted-by":"crossref","unstructured":"Saenko K, Darrell T, Glass JR (2004) Articulatory features for robust visual speech recognition. In Proceedings of the 6th international conference on Multimodal interfaces. ACM 152\u2013158","DOI":"10.1145\/1027933.1027960"},{"key":"3038_CR20","unstructured":"Schroeter J, Sondhi MM (1992) Speech coding based on physiological models of speech production. Advances Speech Signal Process 231\u2013267"},{"issue":"1","key":"3038_CR21","doi-asserted-by":"crossref","first-page":"133","DOI":"10.1109\/89.260356","volume":"2","author":"J Schroeter","year":"1994","unstructured":"Schroeter J, Sondhi MM (1994) Techniques for estimating vocal-tract shapes from the speech signal. IEEE Trans Speech Audio Process 2(1):133\u2013150","journal-title":"IEEE Trans Speech Audio Process"},{"key":"3038_CR22","doi-asserted-by":"crossref","unstructured":"Simko J, Cummins F (2009) Sequencing embodied gestures in speech","DOI":"10.1049\/PBCE071E_ch11"},{"key":"3038_CR23","doi-asserted-by":"crossref","unstructured":"Suzuki S, Okadome T, Honda M (1998) Determination of articulatory positions from speech acoustics by applying dynamic articulatory constraints. In ICSLP","DOI":"10.21437\/ICSLP.1998-559"},{"issue":"3","key":"3038_CR24","doi-asserted-by":"crossref","first-page":"215","DOI":"10.1016\/j.specom.2007.09.001","volume":"50","author":"T Toda","year":"2008","unstructured":"Toda T, Black AW, Tokuda K (2008) Statistical mapping between articulatory movements and acoustic spectrum using a Gaussian mixture model. Speech Commun 50(3):215\u2013227","journal-title":"Speech Commun"},{"key":"3038_CR25","first-page":"3371","volume":"11","author":"P Vincent","year":"2010","unstructured":"Vincent P, Larochelle H, Lajoie I, Bengio Y, Manzagol PA (2010) Stacked denoising autoencoders: learning useful representations in a deep network with a local denoising criterion. J Mach Learn Res 11:3371\u20133408","journal-title":"J Mach Learn Res"},{"key":"3038_CR26","doi-asserted-by":"crossref","unstructured":"Wang L, Qian X, Han W, Soong FK (2010) Synthesizing photo-real talking head via trajectory-guided sample selection. In INTERSPEECH 10:446\u2013449","DOI":"10.21437\/Interspeech.2010-194"},{"issue":"3","key":"3038_CR27","doi-asserted-by":"crossref","first-page":"500","DOI":"10.1109\/TMM.2006.888009","volume":"9","author":"L Xie","year":"2007","unstructured":"Xie L, Liu ZQ (2007) Realistic mouth-synching for speech-driven talking face using articulatory modelling. IEEE Trans Multimedia 9(3):500\u2013510","journal-title":"IEEE Trans Multimedia"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-015-3038-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-015-3038-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-015-3038-y","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,15]],"date-time":"2023-08-15T18:34:53Z","timestamp":1692124493000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-015-3038-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,11,3]]},"references-count":27,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2016,5]]}},"alternative-id":["3038"],"URL":"https:\/\/doi.org\/10.1007\/s11042-015-3038-y","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015,11,3]]}}}