{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:49:31Z","timestamp":1772905771839,"version":"3.50.1"},"reference-count":51,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,12,13]],"date-time":"2024-12-13T00:00:00Z","timestamp":1734048000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,13]],"date-time":"2024-12-13T00:00:00Z","timestamp":1734048000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1007\/s10489-024-05914-z","type":"journal-article","created":{"date-parts":[[2024,12,13]],"date-time":"2024-12-13T06:43:22Z","timestamp":1734072202000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["HyperLips: hyper control lips with high resolution decoder for talking face generation"],"prefix":"10.1007","volume":"55","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7212-1755","authenticated-orcid":false,"given":"Yaosen","family":"Chen","sequence":"first","affiliation":[]},{"given":"Yu","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Zhiqiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yanru","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Han","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Xuming","family":"Wen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,13]]},"reference":[{"issue":"12","key":"5914_CR1","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"Afouras T, Chung JS, Senior A, Vinyals O, Zisserman A (2018) Deep audio-visual speech recognition. IEEE Trans Pattern Anal Mach Intell 44(12):8717\u20138727","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"5914_CR2","unstructured":"Afouras T, Chung JS, Senior A, Vinyals O, Zisserman A (2018) Deep audio-visual speech recognition. In: arXiv:1809.02108"},{"key":"5914_CR3","doi-asserted-by":"crossref","unstructured":"Bigioi D, Basak S, Jordan H, McDonnell R, Corcoran P (2023) Speech driven video editing via an audio-conditioned diffusion model. arXiv:2301.04474","DOI":"10.1016\/j.imavis.2024.104911"},{"key":"5914_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown T, Mann B, Ryder N, Subbiah M, Kaplan JD, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A et al (2020) Language models are few-shot learners. Adv Neural Inf Process Syst 33:1877\u20131901","journal-title":"Adv Neural Inf Process Syst"},{"key":"5914_CR5","doi-asserted-by":"crossref","unstructured":"Chatziagapi A, Athar S, Jain A, Rohith M, Bhat V, Samaras D (2023) Lipnerf: What is the right feature space to lip-sync a nerf? In: 2023 IEEE 17th International conference on automatic face and gesture recognition (FG), IEEE, pp 1\u20138","DOI":"10.1109\/FG57933.2023.10042567"},{"key":"5914_CR6","doi-asserted-by":"crossref","unstructured":"Chen L, Maddox RK, Duan Z, Xu C (2019) Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 7832\u20137841","DOI":"10.1109\/CVPR.2019.00802"},{"key":"5914_CR7","doi-asserted-by":"publisher","first-page":"104144","DOI":"10.1016\/j.imavis.2021.104144","volume":"109","author":"Y Chen","year":"2021","unstructured":"Chen Y, Guo B, Shen Y, Wang W, Lu W, Suo X (2021) Boundary graph convolutional network for temporal action detection. Image Vis Comput 109:104144","journal-title":"Image Vis Comput"},{"issue":"5","key":"5914_CR8","doi-asserted-by":"publisher","first-page":"2962","DOI":"10.1109\/TCSVT.2021.3104226","volume":"32","author":"Y Chen","year":"2021","unstructured":"Chen Y, Guo B, Shen Y, Wang W, Lu W, Suo X (2021) Capsule boundary network with 3d convolutional dynamic routing for temporal action detection. IEEE Trans Circuits Syst Video Technol 32(5):2962\u20132975","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"issue":"15","key":"5914_CR9","doi-asserted-by":"publisher","first-page":"17864","DOI":"10.1007\/s10489-022-03451-1","volume":"52","author":"Y Chen","year":"2022","unstructured":"Chen Y, Guo B, Shen Y, Zhou R, Lu W, Wang W, Wen X, Suo X (2022) Video summarization with u-shaped transformer. Appl Intell 52(15):17864\u201317880","journal-title":"Appl Intell"},{"key":"5914_CR10","doi-asserted-by":"crossref","unstructured":"Chen Y, Tai Y, Liu X, Shen C, Yang J (2018) Fsrnet: End-to-end learning face super-resolution with facial priors. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2492\u20132501","DOI":"10.1109\/CVPR.2018.00264"},{"key":"5914_CR11","unstructured":"Chen Y, Yuan Q, Li Z, Xie C, Liu Y, Wang W, Wen X, Yu Q (2022) Upst-nerf: Universal photorealistic style transfer of neural radiance fields for 3d scene. arXiv:2208.07059"},{"issue":"1","key":"5914_CR12","doi-asserted-by":"publisher","first-page":"72","DOI":"10.1038\/s41597-024-02918-9","volume":"11","author":"Z Chen","year":"2024","unstructured":"Chen Z, Yang J, Feng Z, Zhu H (2024) Railfod23: A dataset for foreign object detection on railroad transmission lines. Sci Data 11(1):72","journal-title":"Sci Data"},{"key":"5914_CR13","doi-asserted-by":"crossref","unstructured":"Chiang PZ, Tsai MS, Tseng HY, Lai WS, Chiu WC (2022) Stylizing 3d scene via implicit representation and hypernetwork. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 1475\u20131484","DOI":"10.1109\/WACV51458.2022.00029"},{"key":"5914_CR14","doi-asserted-by":"crossref","unstructured":"Chung JS, Zisserman A (2016) Out of time: automated lip sync in the wild. In: Asian conference on computer vision. Springer, pp 251\u2013263","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"5914_CR15","doi-asserted-by":"publisher","first-page":"3480","DOI":"10.1109\/TMM.2021.3099900","volume":"24","author":"SE Eskimez","year":"2021","unstructured":"Eskimez SE, Zhang Y, Duan Z (2021) Speech driven talking face generation from a single image and an emotion condition. IEEE Trans Multimedia 24:3480\u20133490","journal-title":"IEEE Trans Multimedia"},{"key":"5914_CR16","doi-asserted-by":"crossref","unstructured":"Guo Y, Chen K, Liang S, Liu YJ, Bao H, Zhang J (2021) Ad-nerf: Audio driven neural radiance fields for talking head synthesis. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 5784\u20135794","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"5914_CR17","unstructured":"Ha D, Dai A, Le QV (2017) Hypernetworks. In: ICLR"},{"key":"5914_CR18","unstructured":"Heusel M, Ramsauer H, Unterthiner T, Nessler B, Hochreiter S (2017) Gans trained by a two time-scale update rule converge to a local nash equilibrium. Adv Neural Inf Process Syst 30"},{"key":"5914_CR19","doi-asserted-by":"crossref","unstructured":"Howard A, Sandler M, Chu G, Chen LC, Chen B, Tan M, Wang W, Zhu Y, Pang R, Vasudevan V et al (2019) Searching for mobilenetv3. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 1314\u20131324","DOI":"10.1109\/ICCV.2019.00140"},{"key":"5914_CR20","doi-asserted-by":"crossref","unstructured":"Karras T, Laine S, Aila T (2019) A style-based generator architecture for generative adversarial networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4401\u20134410","DOI":"10.1109\/CVPR.2019.00453"},{"key":"5914_CR21","first-page":"1755","volume":"10","author":"DE King","year":"2009","unstructured":"King DE (2009) Dlib-ml: A machine learning toolkit. J Mach Learn Res 10:1755\u20131758","journal-title":"J Mach Learn Res"},{"key":"5914_CR22","unstructured":"KR P, Mukhopadhyay R, Philip J, Jha A, Namboodiri V, Jawahar C (2019) Towards automatic face-to-face translation. In: Proceedings of the 27th ACM international conference on multimedia, pp 1428\u20131436"},{"key":"5914_CR23","unstructured":"Lugaresi C, Tang J, Nash H, McClanahan C, Uboweja E, Hays M, Zhang F, Chang CL, Yong MG, Lee J et al (2019) Mediapipe: A framework for building perception pipelines. arXiv:1906.08172"},{"issue":"1","key":"5914_CR24","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall B, Srinivasan PP, Tancik M, Barron JT, Ramamoorthi R, Ng R (2021) Nerf: Representing scenes as neural radiance fields for view synthesis. Commun ACM 65(1):99\u2013106","journal-title":"Commun ACM"},{"key":"5914_CR25","doi-asserted-by":"crossref","unstructured":"Park SJ, Kim M, Hong J, Choi J, Ro YM (2022) Synctalkface: Talking face generation with precise lip-syncing via audio-lip memory. In: Proceedings of the AAAI conference on artificial intelligence, vol\u00a036, pp 2062\u20132070","DOI":"10.1609\/aaai.v36i2.20102"},{"key":"5914_CR26","unstructured":"Peng B, Li C, He P, Galley M, Gao J (2023) Instruction tuning with gpt-4. arXiv:2304.03277"},{"key":"5914_CR27","doi-asserted-by":"crossref","unstructured":"Prajwal K, Mukhopadhyay R, Namboodiri VP, Jawahar C (2020) A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM international conference on multimedia, pp 484\u2013492","DOI":"10.1145\/3394171.3413532"},{"key":"5914_CR28","unstructured":"Radford A, Narasimhan K, Salimans T, Sutskever I et al (2018) Improving language understanding by generative pre-training"},{"issue":"8","key":"5914_CR29","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford A, Wu J, Child R, Luan D, Amodei D, Sutskever I et al (2019) Language models are unsupervised multitask learners. OpenAI blog 1(8):9","journal-title":"OpenAI blog"},{"key":"5914_CR30","doi-asserted-by":"crossref","unstructured":"Ravichandran S, Texler O, Dinev D, Kang HJ (2023) Synthesizing photorealistic virtual humans through cross-modal disentanglement. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4585\u20134594","DOI":"10.1109\/CVPR52729.2023.00445"},{"key":"5914_CR31","doi-asserted-by":"crossref","unstructured":"Rombach R, Blattmann A, Lorenz D, Esser P, Ommer B (2022) High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10684\u201310695","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"5914_CR32","doi-asserted-by":"crossref","unstructured":"Sandler M, Howard A, Zhu M, Zhmoginov A, Chen LC (2018) Mobilenetv2: Inverted residuals and linear bottlenecks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4510\u20134520","DOI":"10.1109\/CVPR.2018.00474"},{"key":"5914_CR33","doi-asserted-by":"crossref","unstructured":"Shen Z, Lai WS, Xu T, Kautz J, Yang MH (2018) Deep semantic face deblurring. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8260\u20138269","DOI":"10.1109\/CVPR.2018.00862"},{"key":"5914_CR34","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556"},{"key":"5914_CR35","unstructured":"Tang J, Wang K, Zhou H, Chen X, He D, Hu T, Liu J, Zeng G, Wang J (2022) Real-time neural radiance talking portrait synthesis via audio-spatial decomposition. arXiv:2211.12368"},{"key":"5914_CR36","doi-asserted-by":"crossref","unstructured":"Thies J, Elgharib M, Tewari A, Theobalt C, Nie\u00dfner M (2020) Neural voice puppetry: Audio-driven facial reenactment. In: European conference on computer vision. Springer, pp 716\u2013731","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"5914_CR37","doi-asserted-by":"crossref","unstructured":"Toshpulatov M, Lee W, Lee S (2023) Talking human face generation: a survey. Expert Syst Appl 119678","DOI":"10.1016\/j.eswa.2023.119678"},{"key":"5914_CR38","doi-asserted-by":"crossref","unstructured":"Wang K, Wu Q, Song L, Yang Z, Wu W, Qian C, He R, Qiao Y, Loy CC (2020) Mead: a large-scale audio-visual dataset for emotional talking-face generation. In: European conference on computer vision. Springer, pp 700\u2013717","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"5914_CR39","unstructured":"Wang T, Zhang K, Chen X, Luo W, Deng J, Lu T, Cao X, Liu W, Li H, Zafeiriou S (2022) A survey of deep face restoration: Denoise, super-resolution, deblur, artifact removal. arXiv:2211.02831"},{"issue":"4","key":"5914_CR40","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang Z, Bovik AC, Sheikh HR, Simoncelli EP (2004) Image quality assessment: from error visibility to structural similarity. IEEE Trans Image Process 13(4):600\u2013612","journal-title":"IEEE Trans Image Process"},{"issue":"3","key":"5914_CR41","doi-asserted-by":"publisher","first-page":"500","DOI":"10.1109\/TMM.2006.888009","volume":"9","author":"L Xie","year":"2007","unstructured":"Xie L, Liu ZQ (2007) Realistic mouth-synching for speech-driven talking face using articulatory modelling. IEEE Trans Multimedia 9(3):500\u2013510","journal-title":"IEEE Trans Multimedia"},{"key":"5914_CR42","unstructured":"Ye Z, He J, Jiang Z, Huang R, Huang J, Liu J, Ren Y, Yin X, Ma Z, Zhao Z (2023) Geneface++: Generalized and stable real-time audio-driven 3d talking face generation. arXiv:2305.00787"},{"key":"5914_CR43","unstructured":"Ye Z, Jiang Z, Ren Y, Liu J, He J, Zhao Z (2023) Geneface: Generalized and high-fidelity audio-driven 3d talking face synthesis. arXiv:2301.13430"},{"key":"5914_CR44","doi-asserted-by":"crossref","unstructured":"Ye Z, Xia M, Yi R, Zhang J, Lai YK, Huang X, Zhang G, Liu YJ (2022) Audio-driven talking face video generation with dynamic convolution kernels. IEEE Trans Multimed","DOI":"10.1109\/TMM.2022.3142387"},{"key":"5914_CR45","doi-asserted-by":"crossref","unstructured":"Yin Y, Robinson J, Zhang Y, Fu Y (2020) Joint super-resolution and alignment of tiny faces. In: Proceedings of the AAAI conference on artificial intelligence, vol\u00a034, pp 12693\u201312700","DOI":"10.1609\/aaai.v34i07.6962"},{"key":"5914_CR46","doi-asserted-by":"publisher","first-page":"2950","DOI":"10.1109\/TMM.2021.3091863","volume":"24","author":"L Yu","year":"2021","unstructured":"Yu L, Xie H, Zhang Y (2021) Multimodal learning for temporally coherent talking face generation with articulator synergy. IEEE Trans Multimed 24:2950\u20132962","journal-title":"IEEE Trans Multimed"},{"key":"5914_CR47","doi-asserted-by":"crossref","unstructured":"Zhang R, Isola P, Efros AA, Shechtman E, Wang O (2018) The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 586\u2013595","DOI":"10.1109\/CVPR.2018.00068"},{"key":"5914_CR48","doi-asserted-by":"crossref","unstructured":"Zhang W, Cun X, Wang X, Zhang Y, Shen X, Guo Y, Shan Y, Wang F (2023) Sadtalker: Learning realistic 3d motion coefficients for stylized audio-driven single image talking face animation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8652\u20138661","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"5914_CR49","doi-asserted-by":"crossref","unstructured":"Zhang Z, Hu Z, Deng W, Fan C, Lv T, Ding Y (2023) Dinet: Deformation inpainting network for realistic face visually dubbing on high resolution video. In: Proceedings of the AAAI conference on artificial intelligence, vol\u00a037, pp 3543\u20133551","DOI":"10.1609\/aaai.v37i3.25464"},{"key":"5914_CR50","doi-asserted-by":"crossref","unstructured":"Zhong W, Fang C, Cai Y, Wei P, Zhao G, Lin L, Li G (2023) Identity-preserving talking face generation with landmark and appearance priors. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9729\u20139738","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"5914_CR51","doi-asserted-by":"crossref","unstructured":"Zhou H, Sun Y, Wu W, Loy CC, Wang X, Liu Z (2021) Pose-controllable talking face generation by implicitly modularized audio-visual representation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4176\u20134186","DOI":"10.1109\/CVPR46437.2021.00416"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-05914-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-024-05914-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-05914-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T15:07:18Z","timestamp":1737385638000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-024-05914-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,13]]},"references-count":51,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,1]]}},"alternative-id":["5914"],"URL":"https:\/\/doi.org\/10.1007\/s10489-024-05914-z","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,13]]},"assertion":[{"value":"22 October 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 December 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"All data used came from public datasets. No additional personal data was collected.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and informed consent for data used"}}],"article-number":"145"}}