{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:14:28Z","timestamp":1774602868014,"version":"3.50.1"},"reference-count":120,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T00:00:00Z","timestamp":1771632000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T00:00:00Z","timestamp":1771632000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key R&D Program of China","doi-asserted-by":"crossref","award":["2022ZD0161501"],"award-info":[{"award-number":["2022ZD0161501"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-025-02685-z","type":"journal-article","created":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T07:39:19Z","timestamp":1771659559000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Follow-Your-Emoji-Faster: Towards Efficient, Fine-Controllable, and Expressive Freestyle Portrait Animation"],"prefix":"10.1007","volume":"134","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-7477-5938","authenticated-orcid":false,"given":"Yue","family":"Ma","sequence":"first","affiliation":[]},{"given":"Zexuan","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Hongyu","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Hongfa","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Heng","family":"Pan","sequence":"additional","affiliation":[]},{"given":"Yingqing","family":"He","sequence":"additional","affiliation":[]},{"given":"Junkun","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Ailing","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Chengfei","family":"Cai","sequence":"additional","affiliation":[]},{"given":"Heung-Yeung","family":"Shum","sequence":"additional","affiliation":[]},{"given":"Zhifeng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Linfeng","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Qifeng","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,21]]},"reference":[{"issue":"6","key":"2685_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3130800.3130818","volume":"36","author":"H Averbuch-Elor","year":"2017","unstructured":"Averbuch-Elor, H., Cohen-Or, D., Kopf, J., & Cohen, M. F. (2017). Bringing portraits to life. ACM transactions on graphics (TOG),36(6), 1\u201313.","journal-title":"ACM transactions on graphics (TOG)"},{"key":"2685_CR2","doi-asserted-by":"crossref","unstructured":"Blattmann, A., Rombach, R., Ling, H., Dockhorn, T., Kim, SW., Fidler, S. & Kreis, K. (2023). Align your latents: High-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 22563\u201322575.","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"2685_CR3","doi-asserted-by":"crossref","unstructured":"Bolya, D. & Hoffman, J. (2023). Token merging for fast stable diffusion. CVPR Workshop on Efficient Deep Learning for Computer Vision.","DOI":"10.1109\/CVPRW59228.2023.00484"},{"key":"2685_CR4","unstructured":"Bolya, D., Fu, CY., Dai, X., Zhang, P., Feichtenhofer, C. & Hoffman, J. (2023). Token merging: Your vit but faster. arXiv:2210.09461."},{"key":"2685_CR5","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A. & Efros, AA. (2023). Instructpix2pix: Learning to follow image editing instructions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 18392\u201318402.","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"2685_CR6","doi-asserted-by":"crossref","unstructured":"Bulat, Adrian, T. & Georgios. (2017). How far are we from solving the 2d & 3d face alignment problem? (and a dataset of 230,000 3d facial landmarks). In: International Conference on Computer Vision","DOI":"10.1109\/ICCV.2017.116"},{"key":"2685_CR7","doi-asserted-by":"crossref","unstructured":"Cao, M., Wang, X., Qi, Z., Shan, Y., Qie, X. & Zheng, Y. (2023). Masactrl: Tuning-free mutual self-attention control for consistent image synthesis and editing. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 22560\u201322570.","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"2685_CR8","unstructured":"Chang, D., Shi, Y., Gao, Q., Fu, J., Xu, H., Song, G., Yan, Q., Zhu, Y., Yang, X. & Soleymani, M. (2024). Magicpose: Realistic human poses and facial expressions retargeting with identity-aware diffusion. arXiv:2311.12052"},{"key":"2685_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Z., Cao, J., Chen, Z., Li, Y. & Ma, C. (2024c). Echomimic: Lifelike audio-driven portrait animations through editable landmark conditions. arXiv:2407.08136","DOI":"10.1609\/aaai.v39i3.32241"},{"key":"2685_CR10","unstructured":"Chen, P., Shen, M., Ye, P., & others. (2024b). $$\\delta $$-dit: A training-free acceleration method tailored for diffusion transformers. arXiv preprint arXiv:2406.01125"},{"key":"2685_CR11","unstructured":"Chen, H., Xia, M., He, Y., Zhang, Y., Cun, X., Yang, S., Xing, J., Liu, Y., Chen, Q., Wang, X., Weng, C. & Shan, Y. (2023). Videocrafter1: Open diffusion models for high-quality video generation. arXiv:2310.19512"},{"key":"2685_CR12","doi-asserted-by":"crossref","unstructured":"Chen, H., Zhang, Y., Cun, X., Xia, M., Wang, X., Weng, C. & Shan, Y. (2024a). Videocrafter2: Overcoming data limitations for high-quality video diffusion models. arXiv:2401.09047","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"2685_CR13","unstructured":"civitai. (2023). https:\/\/civitai.com\/models\/443821\/cyberrealistic-pony"},{"key":"2685_CR14","unstructured":"Cui, J., Li, H., Yao, Y., Zhu, H., Shang, H., Cheng, K., Zhou, H. & Jingdong\u00a0Wang SZ. (2024). Hallo2: Long-duration and high-resolution audio-driven portrait image animation. arXiv:2410.07718"},{"key":"2685_CR15","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N. & Zafeiriou, S. (2019). Arcface: Additive angular margin loss for deep face recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4690\u20134699.","DOI":"10.1109\/CVPR.2019.00482"},{"key":"2685_CR16","doi-asserted-by":"crossref","unstructured":"Deng, Y., Yang, J., Chen, D., Wen, F. & Tong, X. (2020). Disentangled and controllable face image generation via 3d imitative-contrastive learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5154\u20135163.","DOI":"10.1109\/CVPR42600.2020.00520"},{"key":"2685_CR17","doi-asserted-by":"crossref","unstructured":"Drobyshev, N., Chelishev, J., Khakhulin, T., Ivakhnenko, A., Lempitsky, V. & Zakharov, E. (2022). Megaportraits: One-shot megapixel neural head avatars. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 2663\u20132671.","DOI":"10.1145\/3503161.3547838"},{"key":"2685_CR18","unstructured":"duchaitenpony-real. (2023). https:\/\/civitai.com\/models\/477851\/duchaiten-pony-real"},{"key":"2685_CR19","doi-asserted-by":"crossref","unstructured":"Feng, K., Ma, Y., Wang, B., Qi, C., Chen, H., Chen, Q. & Wang, Z. (2024). Dit4edit: Diffusion transformer for image editing. arXiv preprint arXiv:2411.03286.","DOI":"10.1609\/aaai.v39i3.32304"},{"issue":"4","key":"2685_CR20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450626.3459936","volume":"40","author":"Y Feng","year":"2021","unstructured":"Feng, Y., Feng, H., Black, M. J., & Bolkart, T. (2021). Learning an animatable detailed 3d face model from in-the-wild images. ACM Transactions on Graphics (ToG),40(4), 1\u201313.","journal-title":"ACM Transactions on Graphics (ToG)"},{"issue":"4","key":"2685_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3306346.3323028","volume":"38","author":"O Fried","year":"2019","unstructured":"Fried, O., Tewari, A., Zollh\u00f6fer, M., Finkelstein, A., Shechtman, E., Goldman, D. B., Genova, K., Jin, Z., Theobalt, C., & Agrawala, M. (2019). Text-based editing of talking-head video. ACM Transactions on Graphics (TOG),38(4), 1\u201314.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"2685_CR22","doi-asserted-by":"crossref","unstructured":"Gao, J., Liang, K., Wei, T., Chen, W., Ma, Z. & Guo, J. (2024). Dual-prior augmented decoding network for long tail distribution in hoi detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 1806\u20131814.","DOI":"10.1609\/aaai.v38i3.27949"},{"key":"2685_CR23","unstructured":"Gen-2. (2023). https:\/\/runwayml.com\/ai-magic-tools\/gen-2\/"},{"key":"2685_CR24","doi-asserted-by":"crossref","unstructured":"Gong, Y., Zhang, Y., Cun, X., Yin, F., Fan, Y., Wang, X., Wu, B. & Yang, Y. (2023). Toontalker: Cross-domain face reenactment. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 7690\u20137700.","DOI":"10.1109\/ICCV51070.2023.00707"},{"issue":"11","key":"2685_CR25","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2020). Generative adversarial networks. Communications of the ACM,63(11), 139\u2013144.","journal-title":"Communications of the ACM"},{"key":"2685_CR26","unstructured":"Guo, Y., Yang, C., Rao, A., Wang, Y., Qiao, Y., Lin, D. & Dai, B. (2023). Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725."},{"key":"2685_CR27","unstructured":"He, T., Guo, J., Yu, R., Wang, Y., Zhu, J., An, K., Li, L., Tan, X., Wang, C., Hu, H., & others. (2023a). Gaia: Zero-shot talking avatar generation. arXiv preprint arXiv:2311.15230."},{"key":"2685_CR28","unstructured":"He, Y., Yang, S., Chen, H., Cun, X., Xia, M., Zhang, Y., Wang, X., He, R., Chen, Q. & Shan, Y. (2023b). Scalecrafter: Tuning-free higher-resolution visual generation with diffusion models. In: The Twelfth International Conference on Learning Representations."},{"key":"2685_CR29","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y. & Chen, Q. (2022a). Latent video diffusion models for high-fidelity long video generation arXiv:2211.13221 [cs.CV]."},{"key":"2685_CR30","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y. & Chen, Q. (2022b). Latent video diffusion models for high-fidelity long video generation. arXiv preprint arXiv:2211.13221"},{"key":"2685_CR31","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y. & Cohen-Or, D. (2022). Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626."},{"key":"2685_CR32","doi-asserted-by":"crossref","unstructured":"Ho, J., Chan, W., Saharia, C., Whang, J., Gao, R., Gritsenko, A., Kingma, DP., Poole, B., Norouzi, M., Fleet, DJ., & others. (2022a). Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303","DOI":"10.52202\/068431-0628"},{"key":"2685_CR33","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in neural information processing systems,33, 6840\u20136851.","journal-title":"Advances in neural information processing systems"},{"key":"2685_CR34","doi-asserted-by":"crossref","unstructured":"Hong, FT. & Xu, D. (2023). Implicit identity representation conditioned memory compensation network for talking head video generation. In: ICCV.","DOI":"10.1109\/ICCV51070.2023.02108"},{"key":"2685_CR35","doi-asserted-by":"crossref","unstructured":"Hong, FT., Zhang, L., Shen, L. & Xu, D. (2022). Depth-aware generative adversarial network for talking head video generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3397\u20133406.","DOI":"10.1109\/CVPR52688.2022.00339"},{"key":"2685_CR36","first-page":"8633","volume":"35","author":"J Ho","year":"2022","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., & Fleet, D. J. (2022). Video diffusion models. Advances in Neural Information Processing Systems,35, 8633\u20138646.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2685_CR37","unstructured":"Hu, L., Gao, X., Zhang, P., Sun, K., Zhang, B. & Bo, L. (2023). Animate anyone: Consistent and controllable image-to-video synthesis for character animation. arXiv preprint arXiv:2311.17117"},{"key":"2685_CR38","unstructured":"Jiang, J., Liang, C., Yang, J., Lin, G., Zhong, T. & Zheng, Y. (2025). Loopy: Taming audio-driven portrait avatar with long-term motion dependency. arXiv:2409.02634"},{"key":"2685_CR39","doi-asserted-by":"crossref","unstructured":"Khakhulin, T., Sklyarova, V., Lempitsky, V. & Zakharov, E. (2022). Realistic one-shot mesh-based head avatars. In: European Conference on Computer Vision, Springer, pp 345\u2013362","DOI":"10.1007\/978-3-031-20086-1_20"},{"issue":"4","key":"2685_CR40","first-page":"1","volume":"37","author":"H Kim","year":"2018","unstructured":"Kim, H., Garrido, P., Tewari, A., Xu, W., Thies, J., Niessner, M., P\u00e9rez, P., Richardt, C., Zollh\u00f6fer, M., & Theobalt, C. (2018). Deep video portraits. ACM transactions on graphics (TOG),37(4), 1\u201314.","journal-title":"ACM transactions on graphics (TOG)"},{"key":"2685_CR41","unstructured":"Kingma, DP. & Welling, M. (2013). Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114"},{"key":"2685_CR42","doi-asserted-by":"crossref","unstructured":"Lin, Y., Fung, H., Xu, J., Ren, Z., Lau, AS., Yin, G. & Li, X. (2025). Mvportrait: Text-guided motion and emotion control for multi-view vivid portrait animation. arXiv preprint arXiv:2503.19383","DOI":"10.1109\/CVPR52734.2025.02444"},{"key":"2685_CR43","unstructured":"Liu, X., Gong, C. & Liu, Q. (2022). Flow straight and fast: Learning to generate and transfer data with rectified flow. arXiv:2209.03003"},{"key":"2685_CR44","unstructured":"Liu, H., Han, X., Jin, C., Qian, L., Wei, H., Lin, Z., Wang, F., Dong, H., Song, Y., Xu, J. & others. (2023b). Human motionformer: Transferring human motions with vision transformers. arXiv preprint arXiv:2302.11306"},{"key":"2685_CR45","doi-asserted-by":"crossref","unstructured":"Liu, H., Wang, X., Wan, Z., Ma, Y., Chen, J., Fan, Y., Shen, Y., Song, Y. & Chen, Q. (2025a). Avatarartist: Open-domain 4d avatarization. arXiv preprint arXiv:2503.19906","DOI":"10.1109\/CVPR52734.2025.01005"},{"key":"2685_CR46","unstructured":"Liu, G., Xia, M., Zhang, Y., Chen, H., Xing, J., Wang, X., Yang, Y. & Shan, Y. (2023a). Stylecrafter: Enhancing stylized text-to-video generation with style adapter. arXiv preprint arXiv:2312.00330."},{"key":"2685_CR47","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, Y., Li, W., Lin, Z. & Jia, J. (2023c). Video-p2p: Video editing with cross-attention control. arXiv preprint arXiv:2303.04761","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"2685_CR48","unstructured":"Liu, H., Zhang, W., Xie, J. & others. (2024). Faster diffusion via temporal attention decomposition arXiv:2404.02747 [cs.CV]"},{"key":"2685_CR49","unstructured":"Liu, J., Zou, C., Lyu, Y., Chen, J. & Zhang, L. (2025b). From reusing to forecasting: Accelerating diffusion models with taylorseers. arXiv preprint arXiv:2503.06923"},{"key":"2685_CR50","unstructured":"Loshchilov, I. & Hutter, F. (2017). Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101"},{"key":"2685_CR51","unstructured":"Lu, C., Zhou, Y., Bao, F., Chen, J., Li, C. & Zhu, J. (2022). Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps. arXiv:2206.00927"},{"key":"2685_CR52","unstructured":"Lu, C., Zhou, Y., Bao, F., Chen, J., Li, C. & Zhu, J. (2023). Dpm-solver++: Fast solver for guided sampling of diffusion probabilistic models. arXiv:2211.01095"},{"key":"2685_CR53","unstructured":"Lugaresi, C., Tang, J., Nash, H., McClanahan, C., Uboweja, E., Hays, M., Zhang, F., Chang, CL., Yong, MG., Lee, J., & others. (2019). Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:1906.08172"},{"key":"2685_CR54","unstructured":"Ma, Y., Cun, X., He, Y., Qi, C., Wang, X., Shan, Y., Li, X. & Chen, Q. (2023b). Magicstick: Controllable video editing via control handle transformations. arXiv preprint arXiv:2312.03047"},{"key":"2685_CR55","doi-asserted-by":"crossref","unstructured":"Ma, X., Fang, G. & Wang, X. (2023a). Deepcache: Accelerating diffusion models for free. arXiv preprint arXiv:2312.00858","DOI":"10.1109\/CVPR52733.2024.01492"},{"key":"2685_CR56","unstructured":"Ma, X., Fang, G., Mi, M. & Wang, X. (2024a). Learning-to-cache: Accelerating diffusion transformer via layer caching. arXiv preprint arXiv:2406.01733"},{"key":"2685_CR57","doi-asserted-by":"crossref","unstructured":"Ma, Y., He, Y., Cun, X., Wang, X., Chen, S., Li, X. & Chen, Q .(2024b). Follow your pose: Pose-guided text-to-video generation using pose-free videos. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 4117\u20134125.","DOI":"10.1609\/aaai.v38i5.28206"},{"key":"2685_CR58","doi-asserted-by":"crossref","unstructured":"Ma, Y., He, Y., Wang, H., Wang, A., Qi, C., Cai, C., Li, X., Li, Z., Shum, HY., Liu, W. & others. (2025). Follow-your-click: Open-domain regional image animation via short prompts. In: AAAI","DOI":"10.1609\/aaai.v39i6.32643"},{"key":"2685_CR59","doi-asserted-by":"crossref","unstructured":"Ma, Y., Liu, H., Wang, H., Pan, H., He, Y., Yuan, J., Zeng, A., Cai, C., Shum, HY., Liu, W. & Chen, Q. (2024c). Follow-your-emoji: Fine-controllable and expressive freestyle portrait animation. In: SIGGRAPH Asia Conference Papers.","DOI":"10.1145\/3680528.3687587"},{"key":"2685_CR60","doi-asserted-by":"crossref","unstructured":"Ma, Y., Liu, H., Wang, H., Pan, H., He, Y., Yuan, J., Zeng, A., Cai, C., Shum, HY., Liu, W., & others. (2024d). Follow-your-emoji: Fine-controllable and expressive freestyle portrait animation. arXiv preprint arXiv:2406.01900","DOI":"10.1145\/3680528.3687587"},{"key":"2685_CR61","doi-asserted-by":"crossref","unstructured":"Qi, C., Cun, X., Zhang, Y., Lei, C., Wang, X., Shan, Y. & Chen, Q. (2023). Fatezero: Fusing attentions for zero-shot text-based video editing. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 15932\u201315942.","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"2685_CR62","doi-asserted-by":"crossref","unstructured":"Qu, L., Shang, J., Han, X. & Fu, H. (2023). Reenactartface: Artistic face image reenactment. IEEE Transactions on Visualization and Computer Graphics.","DOI":"10.1109\/TVCG.2023.3253184"},{"key":"2685_CR63","unstructured":"Radford, A., Kim, JW., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., & others. (2021). Learning transferable visual models from natural language supervision. In: International conference on machine learning, PMLR, pp 8748\u20138763"},{"key":"2685_CR64","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C. & Chen, M. (2022). Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 1(2):3."},{"key":"2685_CR65","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P. & Ommer, B. (2021). High-resolution image synthesis with latent diffusion models. arXiv:2112.10752","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2685_CR66","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P. & Brox, T. (2015). U-net: Convolutional networks for biomedical image segmentation. In: Medical image computing and computer-assisted intervention\u2013MICCAI 2015: 18th international conference, Munich, Germany, October 5-9, 2015, proceedings, part III 18, Springer, pp 234\u2013241.","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"2685_CR67","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M. & Aberman, K. (2023). Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 22500\u201322510.","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"2685_CR68","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E. L., Ghasemipour, K., Gontijo Lopes, R., Karagol Ayan, B., Salimans, T., et al. (2022). Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems,35, 36479\u201336494.","journal-title":"Advances in neural information processing systems"},{"key":"2685_CR69","unstructured":"Selvaraju, P., Ding, T., Chen, T., Zharkov, I., Liang, L. (2024). Fora: Fast-forward caching in diffusion transformer acceleration. arXiv preprint arXiv:2407.01425"},{"key":"2685_CR70","unstructured":"Siarohin, A., Lathuili\u00e8re, S., Tulyakov, S., Ricci, E. & Sebe, N. (2019). First order motion model for image animation. Advances in neural information processing systems 32."},{"key":"2685_CR71","unstructured":"Singer, U., Polyak, A., Hayes, T., Yin, X., An, J., Zhang, S., Hu, Q., Yang, H., Ashual, O., Gafni, O., & others. (2022). Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792."},{"key":"2685_CR72","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N. & Ganguli, S. (2015). Deep unsupervised learning using nonequilibrium thermodynamics. In: International conference on machine learning, PMLR, pp 2256\u20132265."},{"key":"2685_CR73","unstructured":"Song, Y., Dhariwal, P., Chen, M. & Sutskever, I. (2023). Consistency models."},{"key":"2685_CR74","unstructured":"Song, J., Meng, C. & Ermon, S. (2020). Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502."},{"key":"2685_CR75","unstructured":"Song, J., Meng, C. & Ermon, S. (2022). Denoising diffusion implicit models arXiv:2010.02502 [cs.LG]"},{"key":"2685_CR76","doi-asserted-by":"crossref","unstructured":"Sun, J., Wang, X., Wang, L., Li, X., Zhang, Y., Zhang, H. & Liu, Y. (2023). Next3d: Generative neural texture rasterization for 3d-aware head avatars. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 20991\u201321002.","DOI":"10.1109\/CVPR52729.2023.02011"},{"key":"2685_CR77","doi-asserted-by":"crossref","unstructured":"Thies, J., Zollhofer, M., Stamminger, M., Theobalt, C. & Nie\u00dfner, M. (2016). Face2face: Real-time face capture and reenactment of rgb videos. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2387\u20132395.","DOI":"10.1109\/CVPR.2016.262"},{"key":"2685_CR78","doi-asserted-by":"crossref","unstructured":"Tian, L., Wang, Q., Zhang, B. & Bo, L. (2024). Emo: Emote portrait alive-generating expressive portrait videos with audio2video diffusion model under weak conditions. arXiv preprint arXiv:2402.17485","DOI":"10.1007\/978-3-031-73010-8_15"},{"key":"2685_CR79","unstructured":"Unterthiner, T., Van\u00a0Steenkiste, S., Kurach, K., Marinier, R., Michalski, M. & Gelly, S. (2018). Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:1812.01717"},{"key":"2685_CR80","unstructured":"wairealmix. (2023). https:\/\/civitai.com\/models\/393905\/wai-realmix"},{"key":"2685_CR81","unstructured":"Wan, T., Wang, A., Ai, B., Wen, B., Mao, C., Xie, CW., Chen, D., Yu, F., Zhao, H., Yang, J., Zeng, J., Wang, J., Zhang, J., Zhou, J., Wang, J., Chen, J., Zhu, K., Zhao, K., Yan, K., Huang, L., Feng, M., Zhang, N., Li, P., Wu, P., Chu, R., Feng, R., Zhang, S., Sun, S., Fang, T., Wang, T., Gui, T., Weng, T., Shen, T., Lin, W., Wang, W., Wang, W., Zhou, W., Wang, W., Shen, W., Yu, W., Shi, X., Huang, X., Xu. X., Kou, Y., Lv, Y., Li, Y., Liu, Y., Wang, Y., Zhang, Y., Huang, Y., Li, Y., Wu, Y., Liu, Y., Pan, Y., Zheng, Y., Hong, Y., Shi, Y., Feng, Y., Jiang, Z., Han, Z., Wu, ZF. & Liu, Z. (2025). Wan: Open and advanced large-scale video generative models. arXiv preprint arXiv:2503.20314"},{"key":"2685_CR82","doi-asserted-by":"crossref","unstructured":"Wang, TC., Mallya, A. & Liu, MY. (2021). One-shot free-view neural talking-head synthesis for video conferencing. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"2685_CR83","doi-asserted-by":"crossref","unstructured":"Wang, T., Li, L., Lin, K., Zhai, Y., Lin, CC., Yang, Z., Zhang, H., Liu, Z. & Wang, L. (2023b). Disco: Disentangled control for realistic human dance generation. arXiv preprint arXiv:2307.00040","DOI":"10.1109\/CVPR52733.2024.00891"},{"key":"2685_CR84","unstructured":"Wang, C., Tian, K., Zhang, J., Guan, Y., Luo, F., Shen, F., Jiang, Z., Gu, Q., Han, X. & Yang, W. (2024a). V-express: Conditional dropout for progressive training of portrait video generation."},{"key":"2685_CR85","unstructured":"Wang, H., Wang, Q., Bai, X., Qin, Z. & Chen, A. (2024b). Instantstyle: Free lunch towards style-preserving in text-to-image generation. arXiv preprint arXiv:2404.02733"},{"key":"2685_CR86","unstructured":"Wang, H., Xing, P., Huang, R., Ai, H., Wang, Q. & Bai, X. (2024c). Instantstyle-plus: Style transfer with content-preserving in text-to-image generation. arXiv preprint arXiv:2407.00788"},{"key":"2685_CR87","unstructured":"Wang, J., Yuan, H., Chen, D., Zhang, Y., Wang, X. & Zhang, S. (2023a). Modelscope text-to-video technical report. arXiv preprint arXiv:2308.06571"},{"issue":"4","key":"2685_CR88","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing,13(4), 600\u2013612.","journal-title":"IEEE transactions on image processing"},{"key":"2685_CR89","unstructured":"Wei, H., Yang, Z. & Wang, Z. (2024). Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694"},{"key":"2685_CR90","doi-asserted-by":"crossref","unstructured":"Wiles, O., Koepke, A. & Zisserman, A. (2018). X2face: A network for controlling face generation using images, audio, and pose codes. In: Proceedings of the European conference on computer vision (ECCV), pp 670\u2013686.","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"2685_CR91","doi-asserted-by":"crossref","unstructured":"Wu, JZ., Ge, Y., Wang, X., Lei, SW., Gu, Y., Shi, Y., Hsu, W., Shan, Y., Qie, X. & Shou, MZ. (2023). Tune-a-video: One-shot tuning of image diffusion models for text-to-video generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 7623\u20137633.","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"2685_CR92","unstructured":"Xie, E., Chen, J., Chen, J., & others (2024a). Sana: Efficient high-resolution image synthesis with linear diffusion transformers arXiv:2410.10629 [cs.CV]"},{"key":"2685_CR93","doi-asserted-by":"crossref","unstructured":"Xie, L., Wang, X., Zhang, H., Dong, C. & Shan, Y. (2022). Vfhq: A high-quality dataset and benchmark for video face super-resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 657\u2013666.","DOI":"10.1109\/CVPRW56347.2022.00081"},{"key":"2685_CR94","doi-asserted-by":"crossref","unstructured":"Xie, Y., Xu, H., Song, G., Wang, C., Shi, Y. & Luo, L. (2024b). X-portrait: Expressive portrait animation with hierarchical motion attention. arXiv preprint arXiv:2403.15931","DOI":"10.1145\/3641519.3657459"},{"key":"2685_CR95","doi-asserted-by":"crossref","unstructured":"Xing, J., Xia, M., Liu, Y., Zhang, Y., He, Y., Liu, H., Chen, H., Cun, X., Wang, X., Shan, Y., & others. (2024). Make-your-video: Customized video generation using textual and structural guidance. IEEE Transactions on Visualization and Computer Graphics.","DOI":"10.1109\/TVCG.2024.3365804"},{"key":"2685_CR96","doi-asserted-by":"crossref","unstructured":"Xu, S., Chen, G., Guo, YX., Yang, J., Li, C., Zang, Z., Zhang, Y., Tong, X. & Guo, B. (2024b). Vasa-1: Lifelike audio-driven talking faces generated in real time. arXiv preprint arXiv:2404.10667","DOI":"10.52202\/079017-0021"},{"key":"2685_CR97","unstructured":"Xu, M., Li, H., Su, Q., Shang, H., Zhang, L., Liu, C., Wang, J., Yao, Y. & zhu, S. (2024a). Hallo: Hierarchical audio-driven visual synthesis for portrait image animation. arXiv:2406.08801"},{"key":"2685_CR98","doi-asserted-by":"crossref","unstructured":"Xu, H., Song, G., Jiang, Z., Zhang, J., Shi, Y., Liu, J., Ma, W., Feng, J. & Luo, L. (2023). Omniavatar: Geometry-guided controllable 3d head synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 12814\u201312824","DOI":"10.1109\/CVPR52729.2023.01232"},{"key":"2685_CR99","doi-asserted-by":"crossref","unstructured":"Xu, Z., Yu, Z., Zhou, Z., Zhou, J., Jin, X., Hong, FT., Ji, X., Zhu, J., Cai, C., Tang, S., & others. (2025). Hunyuanportrait: Implicit condition control for enhanced portrait animation. In: Proceedings of the Computer Vision and Pattern Recognition Conference, pp 15909\u201315919.","DOI":"10.1109\/CVPR52734.2025.01483"},{"key":"2685_CR100","doi-asserted-by":"crossref","unstructured":"Xu, Z., Zhang, J., Liew, J. H., Yan, H., Liu, J. W., Zhang, C., Feng, J., & Shou, MZ. (2024). Temporally consistent human image animation using diffusion model: Magicanimate.","DOI":"10.1109\/CVPR52733.2024.00147"},{"key":"2685_CR101","unstructured":"Yan, Z., Ma, Y., Zou, C., Chen, W., Chen, Q. & Zhang, L. (2025). Eedit: Rethinking the spatial and temporal redundancy for efficient image editing. arXiv preprint arXiv:2503.10270"},{"key":"2685_CR102","doi-asserted-by":"crossref","unstructured":"Yang, Z., Zeng, A., Yuan, C. & Li, Y. (2023). Effective whole-body pose estimation with two-stages distillation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 4210\u20134220.","DOI":"10.1109\/ICCVW60793.2023.00455"},{"key":"2685_CR103","doi-asserted-by":"crossref","unstructured":"Yang, J., Zeng, A., Zhang, R. & Zhang, L. (2024). X-pose: Detection any keypoints. ECCV.","DOI":"10.1007\/978-3-031-72952-2_15"},{"key":"2685_CR104","unstructured":"Ye, H., Zhang, J., Liu, S., Han, X. & Yang, W. (2023). Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models."},{"key":"2685_CR105","doi-asserted-by":"crossref","unstructured":"Zeng, B., Liu, X., Gao, S., Liu, B., Li, H., Liu, J. & Zhang, B. (2023). Face animation with an attribute-guided diffusion model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 628\u2013637.","DOI":"10.1109\/CVPRW59228.2023.00070"},{"key":"2685_CR106","doi-asserted-by":"crossref","unstructured":"Zhang, W., Cun, X., Wang, X., Zhang, Y., Shen, X., Guo, Y., Shan, Y. & Wang, F. (2022). Sadtalker: Learning realistic 3d motion coefficients for stylized audio-driven single image talking face animation. arXiv preprint arXiv:2211.12194","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"2685_CR107","doi-asserted-by":"crossref","unstructured":"Zhang, W., Cun, X., Wang, X., Zhang, Y., Shen, X., Guo, Y., Shan, Y. & Wang, F. (2023c). Sadtalker: Learning realistic 3d motion coefficients for stylized audio-driven single image talking face animation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 8652\u20138661.","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"2685_CR108","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, AA., Shechtman, E. & Wang, O. (2018). The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 586\u2013595.","DOI":"10.1109\/CVPR.2018.00068"},{"key":"2685_CR109","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y. & Fan, C. (2021). Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 3661\u20133670.","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"2685_CR110","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A. & Agrawala, M. (2023a). Adding conditional control to text-to-image diffusion models.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"2685_CR111","doi-asserted-by":"crossref","unstructured":"Zhang, E., Tang, J., Ning, X. & Zhang, L. (2025). Training-free and hardware-friendly acceleration for diffusion models via similarity-based token pruning. In: Proceedings of the AAAI Conference on Artificial Intelligence.","DOI":"10.1609\/aaai.v39i9.33071"},{"key":"2685_CR112","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W. & Tian, Q. (2023e). Controlvideo: Training-free controllable text-to-video generation. arXiv preprint arXiv:2305.13077"},{"key":"2685_CR113","doi-asserted-by":"crossref","unstructured":"Zhang, W., Zhai, G., Wei, Y., Yang, X. & Ma, K. (2023d). Blind image quality assessment via vision-language correspondence: A multitask learning perspective. In: IEEE Conference on Computer Vision and Pattern Recognition, pp 14071\u201314081.","DOI":"10.1109\/CVPR52729.2023.01352"},{"key":"2685_CR114","unstructured":"Zhang, Q., Zhang, J., Xu, Y. & Tao, D. (2023b). Vision transformer with quadrangle attention. arXiv preprint arXiv:2303.15105"},{"key":"2685_CR115","doi-asserted-by":"crossref","unstructured":"Zhao, J. & Zhang, H. (2022). Thin-plate spline motion model for image animation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 3657\u20133666.","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"2685_CR116","doi-asserted-by":"crossref","unstructured":"Zhao, B., Meng, L., Yin, W. & Sigal, L. (2019). Image generation from layout. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 8584\u20138593.","DOI":"10.1109\/CVPR.2019.00878"},{"key":"2685_CR117","unstructured":"Zhou, D., Wang, W., Yan, H., Lv, W., Zhu, Y. & Feng, J. (2022). Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018"},{"key":"2685_CR118","doi-asserted-by":"crossref","unstructured":"Zhu, S., Chen, JL., Dai, Z., Xu, Y., Cao, X., Yao, Y., Zhu, H. & Zhu, S. (2024). Champ: Controllable and consistent human image animation with 3d parametric guidance. arXiv:2403.14781","DOI":"10.1007\/978-3-031-73001-6_9"},{"key":"2685_CR119","unstructured":"Zou, C., Liu, X., Liu, T., Huang, S. & Zhang, L. (2024a). Accelerating diffusion transformers with token-wise feature caching. arXiv preprint arXiv:2410.05317"},{"key":"2685_CR120","unstructured":"Zou, C., Zhang, E., Guo, R., & others. (2024b). Accelerating diffusion transformers with dual feature caching. arXiv preprint arXiv:2412.18911"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02685-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02685-z","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02685-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:40:42Z","timestamp":1774600842000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02685-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,21]]},"references-count":120,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2685"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02685-z","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,21]]},"assertion":[{"value":"14 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"130"}}