{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T08:06:52Z","timestamp":1761898012370,"version":"3.37.3"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,12,10]],"date-time":"2024-12-10T00:00:00Z","timestamp":1733788800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,10]],"date-time":"2024-12-10T00:00:00Z","timestamp":1733788800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1007\/s10489-024-06010-y","type":"journal-article","created":{"date-parts":[[2024,12,10]],"date-time":"2024-12-10T08:59:47Z","timestamp":1733821187000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Talking-head video generation with long short-term contextual semantics"],"prefix":"10.1007","volume":"55","author":[{"given":"Zhao","family":"Jing","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3675-8476","authenticated-orcid":false,"given":"Hongxia","family":"Bie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiali","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhisong","family":"Bie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinxin","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianwei","family":"Ren","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yichen","family":"Zhi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,10]]},"reference":[{"key":"6010_CR1","doi-asserted-by":"publisher","unstructured":"Sha T, Zhang W, Shen T, Li Z, Mei T (2023) Deep person generation: A survey from the perspective of face, pose, and cloth synthesis. ACM Comput Surv 55(12). https:\/\/doi.org\/10.1145\/3575656","DOI":"10.1145\/3575656"},{"key":"6010_CR2","doi-asserted-by":"crossref","unstructured":"Siarohin A, Lathuili\u00e8re S, Tulyakov S, Ricci E, Sebe N (2019) Animating arbitrary objects via deep motion transfer. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 2377\u20132386","DOI":"10.1109\/CVPR.2019.00248"},{"key":"6010_CR3","doi-asserted-by":"publisher","unstructured":"Xue H, Ling J, Tang A, Song L, Xie R, Zhang W (2023) High-fidelity face reenactment via identity-matched correspondence learning. ACM Trans Multimed Comput Commun Appl 19(3). https:\/\/doi.org\/10.1145\/3571857","DOI":"10.1145\/3571857"},{"issue":"1","key":"6010_CR4","doi-asserted-by":"publisher","first-page":"560","DOI":"10.1109\/TPAMI.2022.3155571","volume":"45","author":"Y Nirkin","year":"2023","unstructured":"Nirkin Y, Keller Y, Hassner T (2023) Fsganv 2: Improved subject agnostic face swapping and reenactment. IEEE Trans Pattern Anal Mach Intell 45(1):560\u2013575. https:\/\/doi.org\/10.1109\/TPAMI.2022.3155571","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"6010_CR5","doi-asserted-by":"crossref","unstructured":"Tao J, Wang B, Xu B, Ge T, Jiang Y, Li W, Duan L (2022) Structure-aware motion transfer with deformable anchor model. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3637\u20133646","DOI":"10.1109\/CVPR52688.2022.00362"},{"key":"6010_CR6","doi-asserted-by":"crossref","unstructured":"Hong F-T, Shen L, Xu D (2023) Dagan++: Depth-aware generative adversarial network for talking head video generation. IEEE Trans Pattern Anal Mach Intell","DOI":"10.1109\/CVPR52688.2022.00339"},{"key":"6010_CR7","doi-asserted-by":"crossref","unstructured":"Rochow A, Schwarz M, Behnke S (2024) Fsrt: Facial scene representation transformer for face reenactment from factorized appearance head-pose and facial expression features. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 7716\u20137726","DOI":"10.1109\/CVPR52733.2024.00737"},{"issue":"4","key":"6010_CR8","doi-asserted-by":"publisher","first-page":"2734","DOI":"10.1109\/TCSVT.2023.3311039","volume":"34","author":"Z Sheng","year":"2024","unstructured":"Sheng Z, Nie L, Zhang M, Chang X, Yan Y (2024) Stochastic latent talking face generation toward emotional expressions and head poses. IEEE Trans Circ Syst Video Technol 34(4):2734\u20132748. https:\/\/doi.org\/10.1109\/TCSVT.2023.3311039","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"6010_CR9","doi-asserted-by":"publisher","unstructured":"Bounareli S, Tzelepis C, Argyriou V, Patras I, Tzimiropoulos G (2024) One-shot neural face reenactment via finding directions in gan\u2019s latent space. Int J Comput Vis. https:\/\/doi.org\/10.1007\/s11263-024-02018-6","DOI":"10.1007\/s11263-024-02018-6"},{"key":"6010_CR10","unstructured":"Siarohin A, Lathuili\u00e8re S, Tulyakov S, Ricci E, Sebe N (2019) First order motion model for image animation. In: Proceedings of the 33rd international conference on neural information processing systems, pp 7137\u20137147"},{"key":"6010_CR11","doi-asserted-by":"crossref","unstructured":"Wang T-C, Mallya A, Liu M-Y (2021) One-shot free-view neural talking-head synthesis for video conferencing. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10039\u201310049","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"6010_CR12","doi-asserted-by":"crossref","unstructured":"Gao Y, Zhou Y, Wang J, Li X, Ming X, Lu Y: High-fidelity and freely controllable talking head video generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5609\u20135619","DOI":"10.1109\/CVPR52729.2023.00543"},{"issue":"4","key":"6010_CR13","doi-asserted-by":"publisher","first-page":"3313","DOI":"10.1109\/TKDE.2021.3130191","volume":"35","author":"J Gui","year":"2023","unstructured":"Gui J, Sun Z, Wen Y, Tao D, Ye J (2023) A review on generative adversarial networks: Algorithms, theory, and applications. IEEE Trans Knowl Data Eng 35(4):3313\u20133332. https:\/\/doi.org\/10.1109\/TKDE.2021.3130191","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"6010_CR14","doi-asserted-by":"crossref","unstructured":"Zhang Y, Yu L, Sun B, He J (2022) Eng-face: cross-domain heterogeneous face synthesis with enhanced asymmetric cyclegan. Appl Intell 52(13):15295\u201315307","DOI":"10.1007\/s10489-022-03302-z"},{"issue":"2","key":"6010_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3487891","volume":"55","author":"N Aldausari","year":"2022","unstructured":"Aldausari N, Sowmya A, Marcus N, Mohammadi G (2022) Video generative adversarial networks: a review. ACM Comput Surv (CSUR) 55(2):1\u201325","journal-title":"ACM Comput Surv (CSUR)"},{"key":"6010_CR16","doi-asserted-by":"crossref","unstructured":"Tulyakov S, Liu M-Y, Yang X, Kautz J (2018) Mocogan: Decomposing motion and content for video generation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1526\u20131535","DOI":"10.1109\/CVPR.2018.00165"},{"key":"6010_CR17","doi-asserted-by":"publisher","first-page":"7250","DOI":"10.1109\/TMM.2024.3362149","volume":"26","author":"M Zhao","year":"2024","unstructured":"Zhao M, Wang W, Chen T, Zhang R, Li R (2024) Ta2v: Text-audio guided video generation. IEEE Trans Multimed 26:7250\u20137264. https:\/\/doi.org\/10.1109\/TMM.2024.3362149","journal-title":"IEEE Trans Multimed"},{"key":"6010_CR18","doi-asserted-by":"publisher","first-page":"9370","DOI":"10.1109\/TMM.2023.3251095","volume":"25","author":"J Zhu","year":"2023","unstructured":"Zhu J, Ma H, Chen J, Yuan J (2023) Motionvideogan: A novel video generator based on the motion space learned from image pairs. IEEE Trans Multimed 25:9370\u20139382. https:\/\/doi.org\/10.1109\/TMM.2023.3251095","journal-title":"IEEE Trans Multimed"},{"key":"6010_CR19","unstructured":"Wang T-C, Liu M-Y, Zhu J-Y, Liu G, Tao A, Kautz J, Catanzaro B (2018) Video-to-video synthesis. In: Proceedings of the 32nd international conference on neural information processing systems, pp 1152\u20131164"},{"key":"6010_CR20","unstructured":"Wang T-C, Liu M-Y, Tao A, Liu G, Kautz J, Catanzaro B (2019) Few-shot video-to-video synthesis. In: Proceedings of the 33rd international conference on neural information processing systems, pp 5013\u20135024"},{"key":"6010_CR21","doi-asserted-by":"crossref","unstructured":"Pan J, Wang C, Jia X, Shao J, Sheng L, Yan J, Wang X (2019) Video generation from single semantic label map. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3733\u20133742","DOI":"10.1109\/CVPR.2019.00385"},{"issue":"12","key":"6010_CR22","doi-asserted-by":"publisher","first-page":"15946","DOI":"10.1007\/s10489-022-04352-z","volume":"53","author":"W Wan","year":"2023","unstructured":"Wan W, Yang Y, Huang S, Gan L (2023) Fran: feature-filtered residual attention network for realistic face sketch-to-photo transformation. Appl Intell 53(12):15946\u201315956","journal-title":"Appl Intell"},{"key":"6010_CR23","doi-asserted-by":"crossref","unstructured":"Grassal P-W, Prinzler M, Leistner T, Rother C, Nie\u00dfner M, Thies J (2022) Neural head avatars from monocular rgb videos. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 18653\u201318664","DOI":"10.1109\/CVPR52688.2022.01810"},{"key":"6010_CR24","doi-asserted-by":"crossref","unstructured":"Wiles O, Koepke A, Zisserman A (2018) X2face: A network for controlling face generation using images, audio, and pose codes. In: Proceedings of the european conference on computer vision (ECCV), pp 670\u2013686","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"6010_CR25","doi-asserted-by":"crossref","unstructured":"Zakharov E, Ivakhnenko A, Shysheya A, Lempitsky V (2020) Fast bi-layer neural synthesis of one-shot realistic head avatars. In: Computer vision\u2013ECCV 2020: 16th european conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XII 16, pp 524\u2013540. Springer","DOI":"10.1007\/978-3-030-58610-2_31"},{"key":"6010_CR26","doi-asserted-by":"crossref","unstructured":"Song L, Yin G, Liu B, Zhang Y, Yu N (2021) Fsft-net: face transfer video generation with few-shot views. In: 2021 IEEE international conference on image processing (ICIP), pp 3582\u20133586. IEEE","DOI":"10.1109\/ICIP42928.2021.9506512"},{"key":"6010_CR27","unstructured":"Lucas BD, Kanade T (1981) An iterative image registration technique with an application to stereo vision. In: IJCAI\u201981: 7th international joint conference on artificial intelligence, vol 2, pp 674\u2013679"},{"key":"6010_CR28","unstructured":"Jakab T, Gupta A, Bilen H, Vedaldi A (2018) Unsupervised learning of object landmarks through conditional image generation. In: Proceedings of the 32nd international conference on neural information processing systems, pp 4020\u20134031"},{"key":"6010_CR29","doi-asserted-by":"crossref","unstructured":"Johnson J, Alahi A, Fei-Fei L (2016) Perceptual losses for real-time style transfer and super-resolution. In: Computer vision\u2013ECCV 2016: 14th european conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part II 14, pp 694\u2013711. Springer","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"6010_CR30","doi-asserted-by":"crossref","unstructured":"Nagrani A, Chung JS, Zisserman A (2017) Voxceleb: A large-scale speaker identification dataset. Interspeech 2017","DOI":"10.21437\/Interspeech.2017-950"},{"key":"6010_CR31","doi-asserted-by":"crossref","unstructured":"Chung J, Nagrani A, Zisserman A (2018) Voxceleb2: Deep speaker recognition. Interspeech 2018","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"6010_CR32","doi-asserted-by":"crossref","unstructured":"Zhang R, Isola P, Efros AA, Shechtman E, Wang O (2018) The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 586\u2013595","DOI":"10.1109\/CVPR.2018.00068"},{"key":"6010_CR33","unstructured":"Heusel M, Ramsauer H, Unterthiner T, Nessler B, Hochreiter S (2017) Gans trained by a two time-scale update rule converge to a local nash equilibrium. In: Proceedings of the 31st international conference on neural information processing systems, pp 6629\u20136640"},{"key":"6010_CR34","doi-asserted-by":"crossref","unstructured":"Bulat A, Tzimiropoulos G (2017) How far are we from solving the 2d & 3d face alignment problem?(and a dataset of 230,000 3d facial landmarks). In: Proceedings of the IEEE international conference on computer vision, pp 1021\u20131030","DOI":"10.1109\/ICCV.2017.116"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-06010-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-024-06010-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-06010-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T15:06:00Z","timestamp":1737385560000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-024-06010-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,10]]},"references-count":34,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,1]]}},"alternative-id":["6010"],"URL":"https:\/\/doi.org\/10.1007\/s10489-024-06010-y","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"type":"print","value":"0924-669X"},{"type":"electronic","value":"1573-7497"}],"subject":[],"published":{"date-parts":[[2024,12,10]]},"assertion":[{"value":"30 September 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 December 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing of interest"}},{"value":"Informed consent was obtained from all participants involved in the study.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent"}}],"article-number":"120"}}