{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:11:27Z","timestamp":1774602687372,"version":"3.50.1"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T00:00:00Z","timestamp":1770336000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T00:00:00Z","timestamp":1770336000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 92370119"],"award-info":[{"award-number":["No. 92370119"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62376113"],"award-info":[{"award-number":["No. 62376113"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-025-02695-x","type":"journal-article","created":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T04:54:35Z","timestamp":1770353675000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Unlock Pose Diversity: Accurate and Efficient Implicit Keypoint-based Spatiotemporal Diffusion for Audio-driven Talking Portrait"],"prefix":"10.1007","volume":"134","author":[{"given":"Chaolong","family":"Yang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kai","family":"Yao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuyao","family":"Yan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chenru","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weiguang","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guangliang","family":"Cheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifei","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bin","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kaizhu","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,6]]},"reference":[{"key":"2695_CR1","doi-asserted-by":"crossref","unstructured":"Blanz, V., & Vetter, T. (1999). A morphable model for the synthesis of 3d faces. In: Proceedings of the 26th Annual Conference on Computer Graphics and Interactive Techniques, (pp. 187\u2013194)","DOI":"10.1145\/311535.311556"},{"key":"2695_CR2","doi-asserted-by":"crossref","unstructured":"Chen, L., Maddox, R. K., Duan, Z., & Xu, C. (2019). Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: CVPR, pp. 7832\u20137841","DOI":"10.1109\/CVPR.2019.00802"},{"key":"2695_CR3","doi-asserted-by":"publisher","first-page":"2403","DOI":"10.1609\/aaai.v39i3.32241","volume":"39","author":"Z Chen","year":"2025","unstructured":"Chen, Z., Cao, J., Chen, Z., Li, Y., & Ma, C. (2025). Echomimic: Lifelike audio-driven portrait animations through editable landmark conditions. AAAI,39, 2403\u20132410.","journal-title":"AAAI"},{"key":"2695_CR4","doi-asserted-by":"crossref","unstructured":"Cheng, K., Cun, X., Zhang, Y., Xia, M., Yin, F., Zhu, M., Wang, X., Wang, J., & Wang, N. (2022). Videoretalking: Audio-based lip synchronization for talking head video editing in the wild. In: SIGGRAPH ASIA, pp. 1\u20139","DOI":"10.1145\/3550469.3555399"},{"key":"2695_CR5","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., & Zafeiriou, S. (2019). Arcface: Additive angular margin loss for deep face recognition. In: CVPR, pp. 4690\u20134699","DOI":"10.1109\/CVPR.2019.00482"},{"key":"2695_CR6","doi-asserted-by":"crossref","unstructured":"Doukas, M.C., Zafeiriou, S., & Sharmanska, V. (2021). Headgan: One-shot neural head synthesis and editing. In: ICCV, pp. 14398\u201314407","DOI":"10.1109\/ICCV48922.2021.01413"},{"issue":"11","key":"2695_CR7","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2020). Generative adversarial networks. Communications of the ACM,63(11), 139\u2013144.","journal-title":"Communications of the ACM"},{"key":"2695_CR8","unstructured":"Guo, J., Zhang, D., Liu, X., Zhong, Z., Zhang, Y., Wan, P., & Zhang, D. (2024) Liveportrait: Efficient portrait animation with stitching and retargeting control. arXiv preprint arXiv:2407.03168"},{"key":"2695_CR9","doi-asserted-by":"crossref","unstructured":"Gururani, S., Mallya, A., Wang, T.-C., Valle, R., & Liu, M.-Y. (2023). Space: Speech-driven portrait animation with controllable expression. In: ICCV, pp. 20914\u201320923","DOI":"10.1109\/ICCV51070.2023.01912"},{"key":"2695_CR10","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., & Hochreiter, S. (2017) Gans trained by a two time-scale update rule converge to a local nash equilibrium. NeurIPS 30"},{"key":"2695_CR11","doi-asserted-by":"crossref","unstructured":"Hong, F.-T., Zhang, L., Shen, L., & Xu, D. (2022) Depth-aware generative adversarial network for talking head video generation. In: CVPR, pp. 3397\u20133406","DOI":"10.1109\/CVPR52688.2022.00339"},{"key":"2695_CR12","doi-asserted-by":"crossref","unstructured":"KR, P., Mukhopadhyay, R., Philip, J., Jha, A., Namboodiri, V., & Jawahar, C. (2019) Towards automatic face-to-face translation. In: ACM MM, pp. 1428\u20131436","DOI":"10.1145\/3343031.3351066"},{"key":"2695_CR13","doi-asserted-by":"crossref","unstructured":"Kim, S., Jin, S., Park, J., Kim, K., Kim, J., Nam, J., & Kim, S. (2024) Moditalker: Motion-disentangled diffusion model for high-fidelity talking head generation. arXiv preprint arXiv:2403.19144","DOI":"10.5573\/ieie.2024.61.11.92"},{"key":"2695_CR14","unstructured":"Kingma, D. P. (2013). Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114"},{"key":"2695_CR15","unstructured":"Kingma, D. P. (2014). Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980."},{"key":"2695_CR16","doi-asserted-by":"crossref","unstructured":"Liu, T., Chen, F., Fan, S., Du, C., Chen, Q., Chen, X., & Yu, K. (2024). Anitalker: Animate vivid and diverse talking faces through identity-decoupled facial motion encoding. In: ACM MM, pp. 6696\u20136705","DOI":"10.1145\/3664647.3681198"},{"key":"2695_CR17","unstructured":"Ma, Y., Zhang, S., Wang, J., Wang, X., Zhang, Y., & Deng, Z. (2023). Dreamtalk: When expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767"},{"key":"2695_CR18","first-page":"22438","volume":"35","author":"A Mallya","year":"2022","unstructured":"Mallya, A., Wang, T.-C., & Liu, M.-Y. (2022). Implicit warping for animation with image sets. NeurIPS,35, 22438\u201322450.","journal-title":"NeurIPS"},{"key":"2695_CR19","volume":"60","author":"A Nagrani","year":"2019","unstructured":"Nagrani, A., Chung, J. S., Xie, W., & Zisserman, A. (2019). Voxceleb: Large-scale speaker verification in the wild. Computer Science and Language,60, Article 101027.","journal-title":"Computer Science and Language"},{"issue":"9","key":"2695_CR20","first-page":"2678","volume":"20","author":"ND Narvekar","year":"2011","unstructured":"Narvekar, N. D., & Karam, L. J. (2011). A no-reference image blur metric based on the cumulative probability of blur detection (cpbd). IEEE TIP,20(9), 2678\u20132683.","journal-title":"IEEE TIP"},{"key":"2695_CR21","doi-asserted-by":"crossref","unstructured":"Pang, Y., Zhang, Y., Quan, W., Fan, Y., Cun, X., Shan, Y., & Yan, D.-m. (2023). Dpe: Disentanglement of pose and expression for general video portrait editing. In: CVPR, pp. 427\u2013436","DOI":"10.1109\/CVPR52729.2023.00049"},{"key":"2695_CR22","doi-asserted-by":"crossref","unstructured":"Perez, E., Strub, F., De\u00a0Vries, H., Dumoulin, V., & Courville, A. (2018). Film: Visual reasoning with a general conditioning layer. In: AAAI, vol. 32","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"2695_CR23","doi-asserted-by":"crossref","unstructured":"Prajwal, K., Mukhopadhyay, R., Namboodiri, V. P., & Jawahar, C. (2020) A lip sync expert is all you need for speech to lip generation in the wild. In: ACM MM, pp. 484\u2013492","DOI":"10.1145\/3394171.3413532"},{"key":"2695_CR24","doi-asserted-by":"crossref","unstructured":"Ren, Y., Li, G., Chen, Y., Li, T.H., & Liu, S. (2021) Pirenderer: Controllable portrait image generation via semantic neural rendering. In: ICCV, pp. 13759\u201313768","DOI":"10.1109\/ICCV48922.2021.01350"},{"key":"2695_CR25","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10684\u201310695","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2695_CR26","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Chong, E., & Rehg, J.M. (2018). Fine-grained head pose estimation without keypoints. In: CVPRW, pp. 2074\u20132083","DOI":"10.1109\/CVPRW.2018.00281"},{"key":"2695_CR27","doi-asserted-by":"crossref","unstructured":"Shen, S., Zhao, W., Meng, Z., Li, W., Zhu, Z., Zhou, J., & Lu, J. (2023). Difftalk: Crafting diffusion models for generalized audio-driven portraits animation. In: CVPR, pp. 1982\u20131991","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"2695_CR28","unstructured":"Siarohin, A., Lathuili\u00e8re, S., Tulyakov, S., Ricci, E., & Sebe, N. (2019). First order motion model for image animation. NeurIPS 32"},{"key":"2695_CR29","doi-asserted-by":"crossref","unstructured":"Siyao, L., Yu, W., Gu, T., Lin, C., Wang, Q., Qian, C., Loy, C. C., & Liu, Z. (2022). Bailando: 3d dance generation by actor-critic gpt with choreographic memory. In: CVPR, pp. 11050\u201311059","DOI":"10.1109\/CVPR52688.2022.01077"},{"key":"2695_CR30","unstructured":"Song, J., Meng, C., & Ermon, S. (2021). Denoising diffusion implicit models. In: ICLR"},{"key":"2695_CR31","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063","volume":"568","author":"J Su","year":"2024","unstructured":"Su, J., Ahmed, M., Lu, Y., Pan, S., Bo, W., & Liu, Y. (2024). Roformer: Enhanced transformer with rotary position embedding. Neurocomputing,568, Article 127063.","journal-title":"Neurocomputing"},{"issue":"4","key":"2695_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S. M., & Kemelmacher-Shlizerman, I. (2017). Synthesizing obama: learning lip sync from audio. ACM TOG,36(4), 1\u201313.","journal-title":"ACM TOG"},{"key":"2695_CR33","doi-asserted-by":"crossref","unstructured":"Tao, J., Wang, B., Ge, T., Jiang, Y., Li, W., & Duan, L. (2022). Motion transformer for unsupervised image animation. In: ECCV, (pp. 702\u2013719). Springer.","DOI":"10.1007\/978-3-031-19787-1_40"},{"key":"2695_CR34","doi-asserted-by":"crossref","unstructured":"Tian, L., Wang, Q., Zhang, B., & Bo, L. (2024). Emo: Emote portrait alive-generating expressive portrait videos with audio2video diffusion model under weak conditions. arXiv preprint arXiv:2402.17485","DOI":"10.1007\/978-3-031-73010-8_15"},{"key":"2695_CR35","doi-asserted-by":"crossref","unstructured":"Wang, T.-C., Mallya, A., & Liu, M.-Y. (2021). One-shot free-view neural talking-head synthesis for video conferencing. In: CVPR","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"2695_CR36","unstructured":"Wang, Y., Yang, D., Bremond, F., & Dantcheva, A. (2022). Latent image animator: Learning to animate images via latent space navigation. arXiv preprint arXiv:2203.09043."},{"key":"2695_CR37","doi-asserted-by":"publisher","first-page":"2531","DOI":"10.1609\/aaai.v36i3.20154","volume":"36","author":"S Wang","year":"2022","unstructured":"Wang, S., Li, L., Ding, Y., & Yu, X. (2022). One-shot talking face generation from single-speaker audio-visual correlation learning. AAAI,36, 2531\u20132539.","journal-title":"AAAI"},{"key":"2695_CR38","unstructured":"Wei, H., Yang, Z., & Wang, Z. (2024). Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694"},{"key":"2695_CR39","doi-asserted-by":"crossref","unstructured":"Wiles, O., Koepke, A., & Zisserman, A. (2018). X2face: A network for controlling face generation using images, audio, and pose codes. In: ECCV, pp. 670\u2013686","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"2695_CR40","first-page":"660","volume":"37","author":"S Xu","year":"2024","unstructured":"Xu, S., Chen, G., Guo, Y.-X., Yang, J., Li, C., Zang, Z., Zhang, Y., Tong, X., & Guo, B. (2024). Vasa-1: Lifelike audio-driven talking faces generated in real time. NeurIPS,37, 660\u2013684.","journal-title":"NeurIPS"},{"key":"2695_CR41","doi-asserted-by":"publisher","first-page":"9256","DOI":"10.1609\/aaai.v39i9.33002","volume":"39","author":"S Yang","year":"2025","unstructured":"Yang, S., Li, H., Wu, J., Jing, M., Li, L., Ji, R., Liang, J., Fan, H., & Wang, J. (2025). Megactor-sigma: Unlocking flexible mixed-modal control in portrait animation with diffusion transformer. AAAI,39, 9256\u20139264.","journal-title":"AAAI"},{"key":"2695_CR42","unstructured":"Ye, Z., Zhong, T., Ren, Y., Yang, J., Li, W., Huang, J., Jiang, Z., He, J., Huang, R., & Liu, J. (2024). Real3d-portrait: One-shot realistic 3d talking portrait synthesis. In: ICLR"},{"key":"2695_CR43","doi-asserted-by":"crossref","unstructured":"Yin, F., Zhang, Y., Cun, X., Cao, M., Fan, Y., Wang, X., Bai, Q., Wu, B., Wang, J., & Yang, Y. (2022). Styleheat: One-shot high-resolution editable talking face generation via pre-trained stylegan. In In: ECCV, (pp. 85\u2013101). Springer.","DOI":"10.1007\/978-3-031-19790-1_6"},{"key":"2695_CR44","doi-asserted-by":"crossref","unstructured":"Zeng, B., Liu, X., Gao, S., Liu, B., Li, H., Liu, J., & Zhang, B. (2023). Face animation with an attribute-guided diffusion model. In: CVPR, pp. 628\u2013637","DOI":"10.1109\/CVPRW59228.2023.00070"},{"key":"2695_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, W., Cun, X., Wang, X., Zhang, Y., Shen, X., Guo, Y., Shan, Y., & Wang, F. (2023) Sadtalker: Learning realistic 3d motion coefficients for stylized audio-driven single image talking face animation. In: CVPR, pp. 8652\u20138661","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"2695_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., & Fan, C. (2021). Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: CVPR, pp. 3661\u20133670","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"2695_CR47","doi-asserted-by":"crossref","unstructured":"Zhao, J., & Zhang, H. (2022). Thin-plate spline motion model for image animation. In: CVPR, pp. 3657\u20133666","DOI":"10.1109\/CVPR52688.2022.00364"},{"issue":"6","key":"2695_CR48","first-page":"1","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., & Li, D. (2020). Makelttalk: speaker-aware talking-head animation. ACM TOG,39(6), 1\u201315.","journal-title":"ACM TOG"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02695-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02695-x","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02695-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:35:13Z","timestamp":1774600513000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02695-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,6]]},"references-count":48,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2695"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02695-x","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,6]]},"assertion":[{"value":"14 March 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no financial or proprietary interests in any material discussed in this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"All source codes are available at\n                      \n                      .","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code availability"}}],"article-number":"111"}}