{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:26:35Z","timestamp":1740122795077,"version":"3.37.3"},"reference-count":61,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2023,5,25]],"date-time":"2023-05-25T00:00:00Z","timestamp":1684972800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,5,25]],"date-time":"2023-05-25T00:00:00Z","timestamp":1684972800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100004326","name":"Simon Fraser University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004326","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000155","name":"Social Sciences and Humanities Research Council of Canada","doi-asserted-by":"publisher","award":["435-2019-1065"],"award-info":[{"award-number":["435-2019-1065"]}],"id":[{"id":"10.13039\/501100000155","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,7]]},"DOI":"10.1007\/s10772-023-10030-3","type":"journal-article","created":{"date-parts":[[2023,5,25]],"date-time":"2023-05-25T05:01:38Z","timestamp":1684990898000},"page":"459-474","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mouth2Audio: intelligible audio synthesis from videos with distinctive vowel articulation"],"prefix":"10.1007","volume":"26","author":[{"given":"Saurabh","family":"Garg","sequence":"first","affiliation":[]},{"given":"Haoyao","family":"Ruan","sequence":"additional","affiliation":[]},{"given":"Ghassan","family":"Hamarneh","sequence":"additional","affiliation":[]},{"given":"Dawn M.","family":"Behne","sequence":"additional","affiliation":[]},{"given":"Allard","family":"Jongman","sequence":"additional","affiliation":[]},{"given":"Joan","family":"Sereno","sequence":"additional","affiliation":[]},{"given":"Yue","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,5,25]]},"reference":[{"key":"10030_CR1","doi-asserted-by":"publisher","unstructured":"Akbari, H., Himani A., Cao, L. & Mesgarani, N. (2018). Lip2Audspec: Speech reconstruction from silent lip movements video. In 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP), (pp. 2516\u20132520). https:\/\/doi.org\/10.1109\/icassp.2018.8461856.","DOI":"10.1109\/icassp.2018.8461856"},{"key":"10030_CR2","doi-asserted-by":"publisher","DOI":"10.1101\/481267","author":"GK Anumanchipalli","year":"2018","unstructured":"Anumanchipalli, G. K., Chartier, J., & Chang, E. F. (2018). Intelligible speech synthesis from neural decoding of spoken sentences. BioRxiv. https:\/\/doi.org\/10.1101\/481267","journal-title":"BioRxiv"},{"key":"10030_CR3","unstructured":"Assael, Y. M., Shillingford, B., Whiteson, S., & De Freitas, N. (2016). 
Lipnet: End-to-end sentence-level lipreading.\u00a0arXiv preprint arXiv:1611.01599."},{"issue":"1\u20134","key":"10030_CR4","doi-asserted-by":"crossref","first-page":"5","DOI":"10.1016\/j.specom.2004.10.011","volume":"44","author":"LE Bernstein","year":"2004","unstructured":"Bernstein, L. E., Auer, E. T., Jr., & Takayanagi, S. (2004). Auditory speech detection in noise enhanced by lipreading. Speech Communication, 44(1\u20134), 5\u201318.","journal-title":"Speech Communication"},{"issue":"9","key":"10030_CR5","first-page":"341","volume":"5","author":"P Boersma","year":"2001","unstructured":"Boersma, P. (2001). Praat, a system for doing phonetics by computer. Glot International, 5(9), 341\u2013345.","journal-title":"Glot International"},{"key":"10030_CR6","doi-asserted-by":"publisher","first-page":"325","DOI":"10.1016\/0167-6393(94)90026-4","volume":"14","author":"ZS Bond","year":"1994","unstructured":"Bond, Z. S., & Moore, T. J. (1994). A note on the acoustic-phonetic characteristics of inadvertently clear speech. Speech Communication, 14, 325\u2013337. https:\/\/doi.org\/10.1016\/0167-6393(94)90026-4","journal-title":"Speech Communication"},{"issue":"3\u20134","key":"10030_CR7","doi-asserted-by":"crossref","first-page":"255","DOI":"10.1016\/S0167-6393(96)00063-5","volume":"20","author":"AR Bradlow","year":"1996","unstructured":"Bradlow, A. R., Torretta, G. M., & Pisoni, D. B. (1996). Intelligibility of normal speech I: Global and fine-grained acoustic-phonetic talker characteristics. Speech Communication, 20(3\u20134), 255\u2013272.","journal-title":"Speech Communication"},{"issue":"1","key":"10030_CR8","doi-asserted-by":"crossref","first-page":"26","DOI":"10.1044\/1092-4388(2013\/12-0103)","volume":"57","author":"C Burris","year":"2014","unstructured":"Burris, C., Vorperian, H. K., Fourakis, M., Kent, R. D., & Bolt, D. M. (2014). Quantitative and descriptive comparison of four acoustic analysis systems: Vowel measurements. Journal of Speech, Language, and Hearing Research, 57(1), 26\u201345.","journal-title":"Journal of Speech, Language, and Hearing Research"},{"key":"10030_CR9","first-page":"158","volume":"32","author":"L Chen","year":"2019","unstructured":"Chen, L., Su, H., & Ji, Q. (2019). Deep structured prediction for facial landmark detection. Advances in Neural Information Processing Systems, 32, 158.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"5","key":"10030_CR10","doi-asserted-by":"crossref","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., Barker, J., Cunningham, S., & Shao, X. (2006). An audio-visual corpus for speech perception and automatic speech recognition. The Journal of the Acoustical Society of America, 120(5), 2421\u20132424.","journal-title":"The Journal of the Acoustical Society of America"},{"issue":"6","key":"10030_CR12","doi-asserted-by":"crossref","first-page":"3668","DOI":"10.1121\/1.1810292","volume":"116","author":"A Cutler","year":"2004","unstructured":"Cutler, A., Weber, A., Smits, R., & Cooper, N. (2004). Patterns of English phoneme confusions by native and non-native listeners. The Journal of the Acoustical Society of America, 116(6), 3668\u20133678.","journal-title":"The Journal of the Acoustical Society of America"},{"issue":"8","key":"10030_CR13","doi-asserted-by":"crossref","first-page":"698","DOI":"10.1109\/LSP.2004.831663","volume":"11","author":"S Dubnov","year":"2004","unstructured":"Dubnov, S. (2004). 
Generalization of spectral flatness measure for non-gaussian linear processes. IEEE Signal Processing Letters, 11(8), 698\u2013701.","journal-title":"IEEE Signal Processing Letters"},{"key":"10030_CR14","doi-asserted-by":"crossref","unstructured":"Ephrat, A., & Peleg, S. (2017). Vid2speech: speech reconstruction from silent video. In 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5095\u20135099). IEEE.","DOI":"10.1109\/ICASSP.2017.7953127"},{"key":"10030_CR15","unstructured":"Feng, S., Kudina, O., Halpern, B. M., & Scharenborg, O. (2021). Quantifying bias in automatic speech recognition.\u00a0arXiv preprint arXiv:2103.15122."},{"issue":"3","key":"10030_CR16","doi-asserted-by":"crossref","first-page":"779","DOI":"10.1044\/1092-4388(2011\/10-0342)","volume":"55","author":"SH Ferguson","year":"2012","unstructured":"Ferguson, S. H. (2012). Talker differences in clear and conversational speech: Vowel intelligibility for older adults with hearing loss. Journal of Speech Language and Hearing Research, 55(3), 779\u2013790.","journal-title":"Journal of Speech Language and Hearing Research"},{"key":"10030_CR17","doi-asserted-by":"crossref","first-page":"259","DOI":"10.1121\/1.1482078","volume":"112","author":"SH Ferguson","year":"2002","unstructured":"Ferguson, S. H., & Kewley-Port, D. (2002). Vowel intelligibility in clear and conversational speech for normal-hearing and hearing-impaired listeners. The Journal of the Acoustical Society of America, 112, 259\u2013271.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10030_CR18","doi-asserted-by":"crossref","first-page":"1241","DOI":"10.1044\/1092-4388(2007\/087)","volume":"50","author":"SH Ferguson","year":"2007","unstructured":"Ferguson, S. H., & Kewley-Port, D. (2007). Talker differences in clear and conversational speech: Acoustic characteristics of vowels. Journal of Speech Language and Hearing Research, 50, 1241\u20131255.","journal-title":"Journal of Speech Language and Hearing Research"},{"issue":"6","key":"10030_CR19","doi-asserted-by":"crossref","first-page":"3570","DOI":"10.1121\/1.4874596","volume":"135","author":"SH Ferguson","year":"2014","unstructured":"Ferguson, S. H., & Quen\u00e9, H. (2014). Acoustic correlates of vowel intelligibility in clear and conversational speech for young normal-hearing and elderly hearing-impaired listeners. The Journal of the Acoustical Society of America, 135(6), 3570\u20133584.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10030_CR20","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-319-40174-4","volume-title":"An Introduction to Silent Speech Interfaces","author":"J Freitas","year":"2017","unstructured":"Freitas, J., Teixeira, A., Dias, M. S., & Silva, S. (2017). An introduction\u00a0to silent speech interfaces. Springer."},{"issue":"3\u20134","key":"10030_CR21","doi-asserted-by":"crossref","first-page":"213","DOI":"10.1016\/S0167-6393(01)00012-7","volume":"37","author":"JP Gagn\u00e9","year":"2002","unstructured":"Gagn\u00e9, J. P., Rochette, A. J., & Charest, M. (2002). Auditory, visual and audiovisual clear speech. Speech Communication, 37(3\u20134), 213\u2013230.","journal-title":"Speech Communication"},{"issue":"3","key":"10030_CR22","first-page":"1720","volume":"144","author":"S Garg","year":"2019","unstructured":"Garg, S., Tang, L., Hamarneh, G., Jongman, A., Sereno, J. A., & Wang, Y. (2019). 
Computer-vision analysis shows different facial movements for the production of different Mandarin tones. The Journal of the Acoustical Society of America, 144(3), 1720\u20131720.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10030_CR23","doi-asserted-by":"crossref","first-page":"177995","DOI":"10.1109\/ACCESS.2020.3026579","volume":"8","author":"JA Gonzalez-Lopez","year":"2020","unstructured":"Gonzalez-Lopez, J. A., Gomez-Alanis, A., Do\u00f1as, J. M. M., P\u00e9rez-C\u00f3rdoba, J. L., & Gomez, A. M. (2020). Silent speech interfaces for speech restoration: A review. IEEE Access, 8, 177995\u2013178021.","journal-title":"IEEE Access"},{"issue":"2","key":"10030_CR24","doi-asserted-by":"crossref","first-page":"236","DOI":"10.1109\/TASSP.1984.1164317","volume":"32","author":"D Griffin","year":"1984","unstructured":"Griffin, D., & Lim, J. (1984). Signal estimation from modified short-time Fourier transform. IEEE Transactions on Acoustics, Speech, and Signal Processing, 32(2), 236\u2013243.","journal-title":"IEEE Transactions on Acoustics, Speech, and Signal Processing"},{"key":"10030_CR25","doi-asserted-by":"crossref","unstructured":"Harte, C., Sandler, M., & Gasser, M. (2006). Detecting harmonic change in musical audio. In Proceedings of the 1st ACM workshop on audio and music computing multimedia (pp. 21\u201326).","DOI":"10.1145\/1178723.1178727"},{"key":"10030_CR26","doi-asserted-by":"crossref","first-page":"35","DOI":"10.3389\/fnsys.2014.00035","volume":"8","author":"S Heald","year":"2014","unstructured":"Heald, S., & Nusbaum, H. C. (2014). Speech perception as an active cognitive process. Frontiers in Systems Neuroscience, 8, 35.","journal-title":"Frontiers in Systems Neuroscience"},{"key":"10030_CR27","doi-asserted-by":"crossref","first-page":"217","DOI":"10.3389\/fnins.2015.00217","volume":"9","author":"C Herff","year":"2015","unstructured":"Herff, C., Heger, D., De Pesters, A., Telaar, D., Brunner, P., Schalk, G., & Schultz, T. (2015). Brain-to-text: Decoding spoken phrases from phone representations in the brain. Frontiers in Neuroscience, 9, 217.","journal-title":"Frontiers in Neuroscience"},{"issue":"5","key":"10030_CR28","doi-asserted-by":"crossref","first-page":"3099","DOI":"10.1121\/1.411872","volume":"97","author":"J Hillenbrand","year":"1995","unstructured":"Hillenbrand, J., Getty, L. A., Clark, M. J., & Wheeler, K. (1995). Acoustic characteristics of American English vowels. The Journal of the Acoustical Society of America, 97(5), 3099\u20133111.","journal-title":"The Journal of the Acoustical Society of America"},{"issue":"4","key":"10030_CR29","doi-asserted-by":"crossref","first-page":"288","DOI":"10.1016\/j.specom.2009.11.004","volume":"52","author":"T Hueber","year":"2010","unstructured":"Hueber, T., Benaroya, E. L., Chollet, G., Denby, B., Dreyfus, G., & Stone, M. (2010). Development of a silent speech interface driven by ultrasound and optical images of the tongue and lips. Speech Communication, 52(4), 288\u2013300.","journal-title":"Speech Communication"},{"key":"10030_CR30","doi-asserted-by":"crossref","first-page":"1367","DOI":"10.1044\/1092-4388(2003\/106)","volume":"46","author":"A Jongman","year":"2003","unstructured":"Jongman, A., Wang, Y., & Kim, B. H. (2003). Contributions of semantic and facial information to perception of nonsibilant fricatives. 
Journal of Speech Language and Hearing Research, 46, 1367\u20131377.","journal-title":"Journal of Speech Language and Hearing Research"},{"key":"10030_CR31","doi-asserted-by":"crossref","unstructured":"Kawase, T., Hori, Y., Ogawa, T., Sakamoto, S., Suzuki, Y., & Katori, Y. (2015). Importance of Visual Cues in Hearing Restoration by Auditory Prosthesis. In\u00a0Interface Oral Health Science 2014\u00a0(pp. 119\u2013127). Springer","DOI":"10.1007\/978-4-431-55192-8_10"},{"issue":"2","key":"10030_CR32","doi-asserted-by":"crossref","first-page":"598","DOI":"10.1016\/j.csl.2013.02.002","volume":"28","author":"J Kim","year":"2014","unstructured":"Kim, J., & Davis, C. (2014). Comparing the consistency and distinctiveness of speech produced in quiet and in noise. Computer Speech & Language, 28(2), 598\u2013606.","journal-title":"Computer Speech & Language"},{"key":"10030_CR33","first-page":"1755","volume":"10","author":"DE King","year":"2009","unstructured":"King, D. E. (2009). Dlib-ml: A machine learning toolkit. The Journal of Machine Learning Research, 10, 1755\u20131758.","journal-title":"The Journal of Machine Learning Research"},{"issue":"11","key":"10030_CR34","first-page":"860","volume":"61","author":"MV Laitinen","year":"2013","unstructured":"Laitinen, M. V., Disch, S., & Pulkki, V. (2013). Sensitivity of human hearing to changes in phase spectrum. Journal of the Audio Engineering Society, 61(11), 860\u2013877.","journal-title":"Journal of the Audio Engineering Society"},{"key":"10030_CR61","doi-asserted-by":"publisher","unstructured":"Lam, Jennifer, Tjaden, Kris, & Wilding, Greg (2012). Acoustics of clear speech: Effect of instruction. Journal of Speech Language and Hearing Research 55(6), 1807\u20131821. https:\/\/doi.org\/10.1044\/1092-4388(2012\/11-0154)","DOI":"10.1044\/1092-4388(2012\/11-0154)"},{"key":"10030_CR11","doi-asserted-by":"crossref","unstructured":"Le Cornu, T., & Milner, B. (2015). Reconstructing intelligible audio speech from visual speech features. In\u00a0Interspeech\u00a0(pp. 3355\u20133359).","DOI":"10.21437\/Interspeech.2015-139"},{"issue":"4","key":"10030_CR35","doi-asserted-by":"crossref","first-page":"3335","DOI":"10.1121\/1.4970636","volume":"140","author":"KK Leung","year":"2016","unstructured":"Leung, K. K., Redmon, C., Wang, Y., Jongman, A., & Sereno, J. (2016). Cross-linguistic perception of clearly spoken English tense and lax vowels based on auditory, visual, and auditory-visual information. The Journal of the Acoustical Society of America, 140(4), 3335\u20133335.","journal-title":"The Journal of the Acoustical Society of America"},{"issue":"5","key":"10030_CR36","doi-asserted-by":"crossref","first-page":"3261","DOI":"10.1121\/1.2990705","volume":"124","author":"Y Lu","year":"2008","unstructured":"Lu, Y., & Cooke, M. (2008). Speech production modifications produced by competing talkers, babble, and stationary noise. The Journal of the Acoustical Society of America, 124(5), 3261\u20133275.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10030_CR37","doi-asserted-by":"crossref","first-page":"1114","DOI":"10.1121\/1.2821966","volume":"123","author":"K Maniwa","year":"2008","unstructured":"Maniwa, K., Jongman, A., & Wade, T. (2008). Perception of clear fricatives by normal-hearing and simulated hearing-impaired listeners. 
The Journal of the Acoustical Society of America, 123, 1114\u20131125.","journal-title":"The Journal of the Acoustical Society of America"},{"issue":"6","key":"10030_CR38","doi-asserted-by":"crossref","first-page":"3962","DOI":"10.1121\/1.2990715","volume":"125","author":"K Maniwa","year":"2009","unstructured":"Maniwa, K., Jongman, A., & Wade, T. (2009). Acoustic characteristics of clearly spoken English fricatives. The Journal of the Acoustical Society of America, 125(6), 3962\u20133973.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10030_CR40","doi-asserted-by":"crossref","unstructured":"Mira, R., Vougioukas, K., Ma, P., Petridis, S., Schuller, B. W., & Pantic, M. (2022). End-to-end video-to-speech synthesis using generative adversarial networks. In IEEE transactions on cybernetics. arXiv:2104.13332 [cs.LG]","DOI":"10.1109\/TCYB.2022.3162495"},{"issue":"2","key":"10030_CR41","doi-asserted-by":"crossref","first-page":"133","DOI":"10.1111\/j.0963-7214.2004.01502010.x","volume":"15","author":"KG Munhall","year":"2004","unstructured":"Munhall, K. G., Jones, J. A., Callan, D. E., Kuratate, T., & Vatikiotis-Bateson, E. (2004). Visual prosody and speech intelligibility: Head movement improves auditory speech perception. Psychological Science, 15(2), 133\u2013137.","journal-title":"Psychological Science"},{"issue":"2","key":"10030_CR42","doi-asserted-by":"crossref","first-page":"175","DOI":"10.1121\/1.1906875","volume":"24","author":"GE Peterson","year":"1952","unstructured":"Peterson, G. E., & Barney, H. L. (1952). Control methods used in a study of the vowels. The Journal of the Acoustical Society of America, 24(2), 175\u2013184.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10030_CR62","doi-asserted-by":"publisher","unstructured":"Picheny, M. A., Durlach, N. I., & Braida, L. D. (1986). Speaking clearly for the hard of hearing II. Journal of Speech Language and Hearing Research 29(4), 434\u2013446. https:\/\/doi.org\/10.1044\/jshr.2904.434","DOI":"10.1044\/jshr.2904.434"},{"key":"10030_CR43","doi-asserted-by":"crossref","unstructured":"Prajwal, K. R., Mukhopadhyay, R., Namboodiri, V. P., & Jawahar, C. V. (2020). Learning individual speaking styles for accurate lip to speech synthesis. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 13796\u201313805).","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"10030_CR44","doi-asserted-by":"crossref","DOI":"10.1016\/j.wocn.2020.100980","volume":"81","author":"C Redmon","year":"2020","unstructured":"Redmon, C., Leung, K., Wang, Y., McMurray, B., Jongman, A., & Sereno, J. A. (2020). Cross-linguistic perception of clearly spoken English tense and lax vowels based on auditory, visual, and auditory-visual information. Journal of Phonetics, 81, 100980.","journal-title":"Journal of Phonetics"},{"key":"10030_CR45","unstructured":"Roesler, L. (2013). Acoustic characteristics of tense and lax vowels across sentence position in clear speech. Unpublished Master\u2019s thesis, University of Wisconsin-Milwaukee"},{"key":"10030_CR46","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2022.104389","volume":"119","author":"N Saleem","year":"2022","unstructured":"Saleem, N., Gao, J., Irfan, M., Verdu, E., & Fuente, J. P. (2022). E2E\u2013V2SResNet: Deep residual convolutional neural networks for end-to-end video driven speech synthesis. 
Image and Vision Computing, 119, 104389.","journal-title":"Image and Vision Computing"},{"issue":"8","key":"10030_CR47","doi-asserted-by":"crossref","first-page":"1627","DOI":"10.1021\/ac60214a047","volume":"36","author":"A Savitzky","year":"1964","unstructured":"Savitzky, A., & Golay, M. J. (1964). Smoothing and differentiation of data by simplified least squares procedures. Analytical Chemistry, 36(8), 1627\u20131639.","journal-title":"Analytical Chemistry"},{"issue":"4","key":"10030_CR48","doi-asserted-by":"crossref","first-page":"341","DOI":"10.1016\/j.specom.2009.12.002","volume":"52","author":"T Schultz","year":"2010","unstructured":"Schultz, T., & Wand, M. (2010). Modeling coarticulation in EMG-based continuous speech recognition. Speech Communication, 52(4), 341\u2013353.","journal-title":"Speech Communication"},{"issue":"1","key":"10030_CR49","doi-asserted-by":"crossref","first-page":"236","DOI":"10.1111\/j.1749-818X.2008.00112.x","volume":"3","author":"R Smiljani\u0107","year":"2009","unstructured":"Smiljani\u0107, R., & Bradlow, A. R. (2009). Speaking and hearing clearly: Talker and listener factors in speaking style changes. Language and Linguistics Compass, 3(1), 236\u2013264.","journal-title":"Language and Linguistics Compass"},{"issue":"2","key":"10030_CR50","doi-asserted-by":"crossref","first-page":"212","DOI":"10.1121\/1.1907309","volume":"26","author":"WH Sumby","year":"1954","unstructured":"Sumby, W. H., & Pollack, I. (1954). Visual contribution to speech intelligibility in noise. The Journal of the Acoustical Society of America, 26(2), 212\u2013215.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10030_CR51","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.specom.2015.09.008","volume":"75","author":"LY Tang","year":"2015","unstructured":"Tang, L. Y., Hannah, B., Jongman, A., Sereno, J., Wang, Y., & Hamarneh, G. (2015). Examining visible articulatory features in clear and plain speech. Speech Communication, 75, 1\u201313.","journal-title":"Speech Communication"},{"key":"10030_CR52","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1044\/1092-4388(2009\/08-0124)","volume":"53","author":"SM Tasko","year":"2010","unstructured":"Tasko, S. M., & Greilick, K. (2010). Acoustic and articulatory features of diphthong production: A speech clarity study. Journal of Speech Language and Hearing Research, 53, 84\u201399.","journal-title":"Journal of Speech Language and Hearing Research"},{"issue":"2","key":"10030_CR53","doi-asserted-by":"crossref","first-page":"244","DOI":"10.1016\/j.wocn.2006.03.002","volume":"35","author":"H Traunm\u00fcller","year":"2007","unstructured":"Traunm\u00fcller, H., & \u00d6hrstr\u00f6m, N. (2007). Audiovisual perception of openness and lip rounding in front vowels. Journal of Phonetics, 35(2), 244\u2013258.","journal-title":"Journal of Phonetics"},{"key":"10030_CR54","unstructured":"van den Oord, A., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A., Kalchbrenner, N., Senior, A., & Kavukcuoglu, K. (2016). WaveNet: A generative model for raw audio. In Proceeding of 9th ISCA workshop on speech synthesis workshop (SSW 9), 125"},{"key":"10030_CR55","doi-asserted-by":"crossref","unstructured":"Vougioukas, K., Ma, P., Petridis, S., & Pantic, M. (2019). 
Video-driven speech reconstruction using generative adversarial networks.\u00a0arXiv preprint arXiv:1906.06301.","DOI":"10.21437\/Interspeech.2019-1445"},{"key":"10030_CR56","doi-asserted-by":"crossref","unstructured":"Wang, Disong, Yang, Shan, Su, Dan, Liu, Xunying, Yu, Dong & Meng, Helen. (2022). VCVTS: Multi-speaker video-to-speech synthesis via cross-modal knowledge transfer from voice conversion.","DOI":"10.1109\/ICASSP43922.2022.9747427"},{"issue":"1","key":"10030_CR57","doi-asserted-by":"crossref","first-page":"458","DOI":"10.1121\/1.427069","volume":"106","author":"CI Watson","year":"1999","unstructured":"Watson, C. I., & Harrington, J. (1999). Acoustic evidence for dynamic formant trajectories in Australian English vowels. The Journal of the Acoustical Society of America, 106(1), 458\u2013468.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10030_CR58","doi-asserted-by":"crossref","unstructured":"Xiong, W., Wu, L., Alleva, F., Droppo, J., Huang, X., & Stolcke, A. (2018). The Microsoft 2017 conversational speech recognition system. In 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5934\u20135938). IEEE.","DOI":"10.1109\/ICASSP.2018.8461870"},{"issue":"1\u20132","key":"10030_CR59","doi-asserted-by":"crossref","first-page":"23","DOI":"10.1016\/S0167-6393(98)00048-X","volume":"26","author":"H Yehia","year":"1998","unstructured":"Yehia, H., Rubin, P., & Vatikiotis-Bateson, E. (1998). Quantitative association of vocal-tract and facial behavior. Speech Communication, 26(1\u20132), 23\u201343.","journal-title":"Speech Communication"},{"issue":"10","key":"10030_CR60","doi-asserted-by":"crossref","first-page":"1499","DOI":"10.1109\/LSP.2016.2603342","volume":"23","author":"K Zhang","year":"2016","unstructured":"Zhang, K., Zhang, Z., Li, Z., & Qiao, Y. (2016). Joint face detection and alignment using multitask cascaded convolutional networks. 
IEEE Signal Processing Letters, 23(10), 1499\u20131503.","journal-title":"IEEE Signal Processing Letters"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10030-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-023-10030-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10030-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,31]],"date-time":"2023-07-31T11:16:28Z","timestamp":1690802188000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-023-10030-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,25]]},"references-count":61,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2023,7]]}},"alternative-id":["10030"],"URL":"https:\/\/doi.org\/10.1007\/s10772-023-10030-3","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2023,5,25]]},"assertion":[{"value":"6 October 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 April 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 May 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}
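For readers who want to work with this record programmatically: the sketch below shows one way to retrieve it from the Crossref REST API and read out a few of the fields that appear above (title, authors, reference counts). The DOI and the field names come from the record itself; the use of the third-party `requests` package and the printed field selection are illustrative assumptions, not part of the record.

```python
# Minimal sketch, assuming the `requests` package is installed.
# Fetches the Crossref work record shown above and extracts a few fields.
import requests

DOI = "10.1007/s10772-023-10030-3"  # DOI taken from the record above

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the metadata lives under the "message" key

# "title" arrives as a list of strings; "author" as a list of
# {"given": ..., "family": ...} objects, as in the record above.
print(work["title"][0])
print(", ".join(f"{a.get('given', '')} {a.get('family', '')}".strip()
                for a in work.get("author", [])))
print(f"references: {work.get('reference-count')}, "
      f"cited by: {work.get('is-referenced-by-count')}")
```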