{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,18]],"date-time":"2026-01-18T11:23:03Z","timestamp":1768735383991,"version":"3.49.0"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2021,2,22]],"date-time":"2021-02-22T00:00:00Z","timestamp":1613952000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,2,22]],"date-time":"2021-02-22T00:00:00Z","timestamp":1613952000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61761166005"],"award-info":[{"award-number":["61761166005"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004663","name":"Ministry of Science and Technology, Taiwan","doi-asserted-by":"publisher","award":["MOST 106-2218-E-032-003-MY3"],"award-info":[{"award-number":["MOST 106-2218-E-032-003-MY3"]}],"id":[{"id":"10.13039\/501100004663","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004731","name":"Natural Science Foundation of Zhejiang Province","doi-asserted-by":"publisher","award":["LY20F020007"],"award-info":[{"award-number":["LY20F020007"]}],"id":[{"id":"10.13039\/501100004731","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Ningbo Science and Technology Planning Project","award":["2019B10032"],"award-info":[{"award-number":["2019B10032"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2022,3]]},"DOI":"10.1007\/s00371-021-02074-w","type":"journal-article","created":{"date-parts":[[2021,2,22]],"date-time":"2021-02-22T16:03:26Z","timestamp":1614009806000},"page":"1151-1164","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":31,"title":["Facial expression GAN for voice-driven face generation"],"prefix":"10.1007","volume":"38","author":[{"given":"Zheng","family":"Fang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1806-5027","authenticated-orcid":false,"given":"Zhen","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tingting","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chih-Chieh","family":"Hung","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiangjian","family":"Xiao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guangjin","family":"Feng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,2,22]]},"reference":[{"key":"2074_CR1","doi-asserted-by":"crossref","unstructured":"Sriram, A., Jun, H., Gaur, Y., Satheesh, S.: Robust speech recognition using generative adversarial networks. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5639\u20135643 (2018)","DOI":"10.1109\/ICASSP.2018.8462456"},{"key":"2074_CR2","unstructured":"Dumpala, S.H., Sheikh, I., Chakraborty, R., Kopparapu, S.K.: A Cycle-GAN approach to model natural perturbations in speech for ASR applications. arXiv preprint arXiv:1912.11151 (2019)"},{"key":"2074_CR3","doi-asserted-by":"crossref","unstructured":"Dai, B., Fidler, S., Urtasun, R., Lin, D.: Towards diverse and natural image descriptions via a conditional gan. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2970\u20132979 (2017)","DOI":"10.1109\/ICCV.2017.323"},{"key":"2074_CR4","doi-asserted-by":"crossref","unstructured":"Chen, C., Mu, S., Xiao, W., Ye, Z., Wu, L., Ju, Q.: Improving image captioning with conditional generative adversarial nets. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 8142\u20138150 (2019)","DOI":"10.1609\/aaai.v33i01.33018142"},{"key":"2074_CR5","unstructured":"Goodfellow, I., Pougetabadie, J., Mirza, M., Xu, B., Wardefarley, D., Ozair, S., Courville, A., Bengio, Y.: Generative adversarial nets. In: Advances in Neural Information Processing Systems, pp. 2672\u20132680 (2014)"},{"key":"2074_CR6","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1016\/j.inffus.2017.02.003","volume":"37","author":"S Poria","year":"2017","unstructured":"Poria, S., Cambria, E., Bajpai, R., Hussain, A.: A review of affective computing: from unimodal analysis to multimodal fusion. Inf. Fusion 37, 98\u2013125 (2017)","journal-title":"Inf. Fusion"},{"key":"2074_CR7","doi-asserted-by":"crossref","unstructured":"Han, F., Guerrero, R., Pavlovic, V.: CookGAN: meal image synthesis from ingredients. Computer Vision and Pattern Recognition. arXiv (2020)","DOI":"10.1109\/WACV45572.2020.9093463"},{"key":"2074_CR8","doi-asserted-by":"crossref","unstructured":"Nasir, O.R., Jha, S.K., Grover, M.S., Yu, Y., Kumar, A., Shah, R.R.: Text2FaceGAN: face generation from fine grained textual descriptions. In: IEEE International Conference on Multimedia Big Data, pp. 58\u201367 (2019)","DOI":"10.1109\/BigMM.2019.00-42"},{"key":"2074_CR9","unstructured":"Qiu, Y., Kataoka, H.: Image generation associated with music data. In: Computer Vision and Pattern Recognition (CVPR), pp. 2510\u20132513 (2018)"},{"key":"2074_CR10","doi-asserted-by":"crossref","unstructured":"Wan, C., Chuang, S., Lee, H.: Towards audio to scene image synthesis using generative adversarial network. In: International Conference on Acoustics Speech and Signal Processing (ICASSP), pp. 496\u2013500 (2019)","DOI":"10.1109\/ICASSP.2019.8682383"},{"key":"2074_CR11","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J., Zhou, T., Efros, A.A.: Image-to-image translation with conditional adversarial networks. In: Computer Vision and Pattern Recognition (CVPR), pp. 5967\u20135976 (2017)","DOI":"10.1109\/CVPR.2017.632"},{"key":"2074_CR12","doi-asserted-by":"crossref","unstructured":"Duarte, A., Roldan, F., Tubau, M., Escur, J., Pascual, S., Salvador, A., Mohedano, E., Mcguinness, K., Torres, J., Giroinieto, X.: Wav2Pix: speech-conditioned face generation using generative adversarial networks. In: International Conference on Acoustics Speech and Signal Processing (ICASSP), pp. 8633\u20138637 (2019)","DOI":"10.1109\/ICASSP.2019.8682970"},{"key":"2074_CR13","doi-asserted-by":"crossref","unstructured":"Oh, T., Dekel, T., Kim, C., Mosseri, I., Freeman, W.T., Rubinstein, M., Matusik, W.: Speech2Face: learning the face behind a voice. In: Computer Vision and Pattern Recognition (CVPR), pp. 7539\u20137548 (2019)","DOI":"10.1109\/CVPR.2019.00772"},{"key":"2074_CR14","unstructured":"Wen, Y., Singh, R., Raj, B.: Face reconstruction from voice using generative adversarial networks. In: Advances in Neural Information Processing Systems (NIPS), pp. 5265\u20135274 (2019)"},{"key":"2074_CR15","unstructured":"Odena, A., Olah, C., Shlens, J.: Conditional image synthesis with auxiliary classifier GANs. In: International Conference on Machine Learning, pp. 2642\u20132651 (2017)"},{"issue":"3","key":"2074_CR16","doi-asserted-by":"publisher","first-page":"868","DOI":"10.3758\/s13414-015-1045-8","volume":"78","author":"HMJ Smith","year":"2016","unstructured":"Smith, H.M.J., Dunn, A.K., Baguley, T., Stacey, P.C.: Matching novel face and voice identity using static and dynamic facial images. Atten. Percept. Psychophys. 78(3), 868\u2013879 (2016)","journal-title":"Atten. Percept. Psychophys."},{"key":"2074_CR17","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., Zisserman, A.: Seeing voices and hearing faces: cross-modal biometric matching. In: Computer Vision and Pattern Recognition (CVPR), pp. 8427\u20138436 (2018)","DOI":"10.1109\/CVPR.2018.00879"},{"issue":"5","key":"2074_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone, S.R., Russo, F.A.: The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in North American English. PLOS ONE 13(5), 1\u201335 (2018)","journal-title":"PLOS ONE"},{"key":"2074_CR19","doi-asserted-by":"crossref","unstructured":"Martin, O., Kotsia, I., Macq, B., Pitas, I.: The eNTERFACE\u201905 audio-visual emotion database. In: 22nd International Conference on Data Engineering Workshops (ICDEW\u201906), pp. 8\u20138. IEEE Computer Society (2006)","DOI":"10.1109\/ICDEW.2006.145"},{"key":"2074_CR20","unstructured":"Nguyen, T.D., Le, T., Vu, H., Phung, D.: Dual discriminator generative adversarial nets. In: Advances in Neural Information Processing Systems (NIPS), pp. 2670\u20132680 (2017)"},{"key":"2074_CR21","unstructured":"Durugkar, I., Gemp, I., Mahadevan, S.: Generative multi-adversarial networks. In: International Conference on Learning Representations (2017)"},{"key":"2074_CR22","doi-asserted-by":"crossref","unstructured":"Chen, L., Maddox, R.K., Duan, Z., Xu, C.: Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: Computer Vision and Pattern Recognition (CVPR), pp. 7832\u20137841 (2019)","DOI":"10.1109\/CVPR.2019.00802"},{"key":"2074_CR23","unstructured":"Chung, J.S., Jamaludin, A., Zisserman, A.: You said that? In: British Machine Vision Conference (BMVC) (2017)"},{"key":"2074_CR24","doi-asserted-by":"crossref","unstructured":"Vougioukas, K., Petridis, S., Pantic, M.: End-to-end speech-driven facial animation with temporal GANs. In: British Machine Vision Conference (BMVC) (2018)","DOI":"10.1007\/s11263-019-01251-8"},{"issue":"5","key":"2074_CR25","first-page":"1398","volume":"8","author":"V Konstantinos","year":"2020","unstructured":"Konstantinos, V., Stavros, P., Maja, P.: Realistic speech-driven facial animation with GANs. Int. J. Comput. Vis. 8(5), 1398\u20131413 (2020)","journal-title":"Int. J. Comput. Vis."},{"issue":"8","key":"2074_CR26","doi-asserted-by":"publisher","first-page":"1240","DOI":"10.1109\/JSTSP.2017.2763455","volume":"11","author":"S Watanabe","year":"2017","unstructured":"Watanabe, S., Kim, S., Hershey, J.R., Hori, T.: Hybrid CTC\/attention architecture for end-to-end speech recognition. IEEE J. Sel. Top. Signal Process. 11(8), 1240\u20131253 (2017)","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"2074_CR27","doi-asserted-by":"crossref","unstructured":"Chandrasekar, P., Chapaneri, S., Jayaswal, D.: Automatic speech emotion recognition: a survey. In: International Conference on Circuits, pp. 341\u2013346 (2014)","DOI":"10.1109\/CSCITA.2014.6839284"},{"issue":"1","key":"2074_CR28","doi-asserted-by":"publisher","first-page":"1261","DOI":"10.1515\/jisys-2018-0372","volume":"29","author":"V Passricha","year":"2019","unstructured":"Passricha, V., Aggarwal, R.K.: A hybrid of deep CNN and bidirectional LSTM for automatic speech recognition. J. Intell. Syst. 29(1), 1261\u20131274 (2019)","journal-title":"J. Intell. Syst."},{"issue":"1","key":"2074_CR29","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1109\/TPAMI.2008.52","volume":"31","author":"Z Zeng","year":"2009","unstructured":"Zeng, Z., Pantic, M., Roisman, G.I., Huang, T.S.: A survey of affect recognition methods: audio, visual, and spontaneous expressions. IEEE Trans. Pattern Anal. Mach. Intell. 31(1), 39\u201358 (2009)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2074_CR30","doi-asserted-by":"crossref","unstructured":"Aldeneh, Z., Provost, E.M.: Using regional saliency for speech emotion recognition. In: International Conference on Acoustics, Speech and Signal Processing, pp. 2741\u20132745 (2017)","DOI":"10.1109\/ICASSP.2017.7952655"},{"issue":"11","key":"2074_CR31","first-page":"1","volume":"6","author":"F Chenchah","year":"2015","unstructured":"Chenchah, F., Lachiri, Z.: Acoustic emotion recognition using linear and nonlinear cepstral coefficients. Int. J. Adv. Comput. Sci. Appl. 6(11), 1\u20134 (2015)","journal-title":"Int. J. Adv. Comput. Sci. Appl."},{"key":"2074_CR32","unstructured":"Waghmare, V.B., Deshmukh, R.R., Shrishrimal, P.P., Janvale, G.B., Ambedkar, B.B.: Emotion recognition system from artificial Marathi speech using MFCC and LDA techniques. In: International Conference on Advances in Communication, Network, and Computing (2014)"},{"issue":"11","key":"2074_CR33","doi-asserted-by":"publisher","first-page":"1675","DOI":"10.1109\/TASLP.2019.2925934","volume":"27","author":"Y Xie","year":"2019","unstructured":"Xie, Y., Liang, R., Liang, Z., Huang, C., Zou, C., Schuller, B.: Speech emotion classification using attention-based LSTM. IEEE Trans. Audio Speech Lang. Process. 27(11), 1675\u20131685 (2019)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"2074_CR34","unstructured":"Huang, Z., Dong, M., Mao, Q., Zhan, Y.: Speech emotion recognition using CNN. In: the Proceedings of the 22nd ACM international conference on Multimedia, pp. 801\u2013804"},{"key":"2074_CR35","unstructured":"Yi, R., Ye, Z., Zhang, J., Bao, H., Liu, Y.: Audio-driven talking face video generation with learning-based personalized head pose. arXiv preprint arXiv:2002.10137 (2020)"},{"issue":"4","key":"2074_CR36","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmachershlizerman, I.: Synthesizing Obama: learning lip sync from audio. ACM Trans. Graph. (TOG) 36(4), 1\u201313 (2017)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"2074_CR37","unstructured":"Jalalifar, S.A., Hasani, H., Aghajan, H.: Speech-driven facial reenactment using conditional generative adversarial networks. arXiv preprint arXiv:1803.07461 (2018)"},{"key":"2074_CR38","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2019.2916031","author":"N Sadoughi","year":"2019","unstructured":"Sadoughi, N., Busso, C.: Speech-driven expressive talking lips with conditional sequential generative adversarial networks. IEEE Trans. Affect. Comput. (2019). https:\/\/doi.org\/10.1109\/TAFFC.2019.2916031","journal-title":"IEEE Trans. Affect. Comput."},{"key":"2074_CR39","unstructured":"Duan, B., Wang, W., Tang, H., Latapie, H., Yan, Y.: Cascade attention guided residue learning GAN for cross-modal translation. arXiv preprint arXiv:1907.01826 (2019)"},{"key":"2074_CR40","doi-asserted-by":"crossref","unstructured":"Van Segbroeck, M., Tsiartas, A., Narayanan, S.S.: A robust frontend for VAD: exploiting contextual, discriminative and spectral cues of human voice. In: Conference of the International Speech Communication Association, pp. 704\u2013708 (2013)","DOI":"10.21437\/Interspeech.2013-198"},{"key":"2074_CR41","first-page":"1755","volume":"10","author":"DE King","year":"2009","unstructured":"King, D.E.: Dlib-ml: a machine learning toolkit. J. Mach. Learn. Res. 10, 1755\u20131758 (2009)","journal-title":"J. Mach. Learn. Res."},{"key":"2074_CR42","unstructured":"Gulrajani, I., Ahmed, F., Arjovsky, M., Dumoulin, V., Courville, A.C.: Improved training of Wasserstein Gans. In: Advances in Neural Information Processing Systems, pp. 5767\u20135777 (2017)"},{"key":"2074_CR43","unstructured":"Radford, A., Metz, L., Chintala, S.: Unsupervised representation learning with deep convolutional generative adversarial networks. In: International Conference on Learning Representations (2016)"},{"key":"2074_CR44","unstructured":"Gan, Z., Chen, L., Wang, W., Pu, Y., Zhang, Y., Liu, H., Li, C., Carin, L.: Triangle generative adversarial networks. In: Advances in Neural Information Processing Systems, pp. 5247\u20135256 (2017)"},{"key":"2074_CR45","unstructured":"Li, C., Xu, K., Zhu, J., Zhang, B.: Triple generative adversarial nets. In: Advances in Neural Information Processing Systems, pp. 4088\u20134098 (2017)"},{"key":"2074_CR46","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. In: Neural Information Processing Systems, pp. 2234\u20132242 (2016)"},{"key":"2074_CR47","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. In: Neural Information Processing Systems, pp. 6626\u20136637 (2017)"},{"key":"2074_CR48","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"key":"2074_CR49","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A.: Deep face recognition. In: British Machine Vision Conference (2015)","DOI":"10.5244\/C.29.41"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-021-02074-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-021-02074-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-021-02074-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,3,10]],"date-time":"2022-03-10T12:17:42Z","timestamp":1646914662000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-021-02074-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,2,22]]},"references-count":49,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2022,3]]}},"alternative-id":["2074"],"URL":"https:\/\/doi.org\/10.1007\/s00371-021-02074-w","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,2,22]]},"assertion":[{"value":"25 January 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 February 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with ethical standards"}},{"value":"All authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}