{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,10]],"date-time":"2026-07-10T12:18:04Z","timestamp":1783685884616,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","funder":[{"name":"Key Research and Development Jianbing Program of Zhejiang Province","award":["2023C01002"],"award-info":[{"award-number":["2023C01002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733464","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"1283-1292","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Vividportraits: Face Parsing Guided Portrait Animation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-3346-3642","authenticated-orcid":false,"given":"Xuze","family":"Tian","sequence":"first","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3427-9014","authenticated-orcid":false,"given":"Jinshan","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7431-3048","authenticated-orcid":false,"given":"Tao","family":"Jiang","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4494-193X","authenticated-orcid":false,"given":"Boxi","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6335-8312","authenticated-orcid":false,"given":"Meng","family":"Xi","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5313-2742","authenticated-orcid":false,"given":"Zejian","family":"Li","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4703-7348","authenticated-orcid":false,"given":"Jianwei","family":"Yin","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"e_1_3_2_1_3_1","volume-title":"MagicPose: Realistic Human Poses and Facial Expressions Retargeting with Identity-aware Diffusion. In Forty-first International Conference on Machine Learning.","author":"Chang Di","year":"2023","unstructured":"Di Chang, Yichun Shi, Quankai Gao, Hongyi Xu, Jessica Fu, Guoxian Song, Qing Yan, Yizhe Zhu, Xiao Yang, and Mohammad Soleymani. 2023. MagicPose: Realistic Human Poses and Facial Expressions Retargeting with Identity-aware Diffusion. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547838"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3272127.3275043"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00995"},{"key":"e_1_3_2_1_10_1","volume-title":"Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Zhengyang Liang, Yaohui Wang, Yu Qiao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Gaia: Zero-shot talking avatar generation. arXiv preprint arXiv:2311.15230","author":"He Tianyu","year":"2023","unstructured":"Tianyu He, Junliang Guo, Runyi Yu, Yuchi Wang, Jialiang Zhu, Kaikai An, Leyi Li, Xu Tan, Chunyu Wang, Han Hu, et al. 2023. Gaia: Zero-shot talking avatar generation. arXiv preprint arXiv:2311.15230 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"ID-Animator: Zero-Shot Identity-Preserving Human Video Generation. arXiv preprint arXiv:2404.15275","author":"He Xuanhua","year":"2024","unstructured":"Xuanhua He, Quande Liu, Shengju Qian, Xin Wang, Tao Hu, Ke Cao, Keyu Yan, Man Zhou, and Jie Zhang. 2024. ID-Animator: Zero-Shot Identity-Preserving Human Video Generation. arXiv preprint arXiv:2404.15275 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02108"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00339"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2010.579"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8153--8163","author":"Hu Li","year":"2024","unstructured":"Li Hu. 2024. Animate anyone: Consistent and controllable image-to-video synthesis for character animation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8153--8163."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01521-4"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20086-1_20"},{"key":"e_1_3_2_1_20_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_21_1","unstructured":"Hongyu Liu Xintong Han Chengbin Jin Lihui Qian Huawei Wei Zhe Lin Faqiang Wang Haoye Dong Yibing Song Jia Xu et al. 2023. Human motionformer: Transferring human motions with vision transformers. arXiv preprint arXiv:2302.11306 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"e_1_3_2_1_23_1","volume-title":"Juhyun Lee, et al.","author":"Lugaresi Camillo","year":"2019","unstructured":"Camillo Lugaresi, Jiuqiang Tang, Hadon Nash, Chris McClanahan, Esha Uboweja, Michael Hays, Fan Zhang, Chuo-Ling Chang, Ming Guang Yong, Juhyun Lee, et al. 2019. Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:1906.08172 (2019)."},{"key":"e_1_3_2_1_24_1","volume-title":"Magicstick: Controllable video editing via control handle transformations. arXiv preprint arXiv:2312.03047","author":"Ma Yue","year":"2023","unstructured":"Yue Ma, Xiaodong Cun, Yingqing He, Chenyang Qi, Xintao Wang, Ying Shan, Xiu Li, and Qifeng Chen. 2023. Magicstick: Controllable video editing via control handle transformations. arXiv preprint arXiv:2312.03047 (2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Yue Ma Hongyu Liu Hongfa Wang Heng Pan Yingqing He Junkun Yuan Ailing Zeng Chengfei Cai Heung-Yeung Shum Wei Liu et al. 2024. Follow-Your-Emoji: Fine-Controllable and Expressive Freestyle Portrait Animation. arXiv preprint arXiv:2406.01900 (2024).","DOI":"10.1145\/3680528.3687587"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"e_1_3_2_1_27_1","volume-title":"ReenactArtFace: Artistic Face Image Reenactment","author":"Qu Linzi","year":"2023","unstructured":"Linzi Qu, Jiaxiang Shang, Xiaoguang Han, and Hongbo Fu. 2023. ReenactArtFace: Artistic Face Image Reenactment. IEEE Transactions on Visualization and Computer Graphics (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01350"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_31_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation. In Medical image computing and computer-assisted intervention--MICCAI 2015: 18th international conference","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In Medical image computing and computer-assisted intervention--MICCAI 2015: 18th international conference, Munich, Germany, October 5--9, 2015, proceedings, part III 18. Springer, 234--241."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.59"},{"key":"e_1_3_2_1_34_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems, Vol. 35 (2022), 36479--36494."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00248"},{"key":"e_1_3_2_1_36_1","volume-title":"First order motion model for image animation. Advances in neural information processing systems","author":"Siarohin Aliaksandr","year":"2019","unstructured":"Aliaksandr Siarohin, St\u00e9phane Lathuili\u00e8re, Sergey Tulyakov, Elisa Ricci, and Nicu Sebe. 2019b. First order motion model for image animation. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"e_1_3_2_1_38_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020a. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_1_39_1","volume-title":"Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2020b. Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02011"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"e_1_3_2_1_42_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing, Vol. 13, 4 (2004), 600--612."},{"key":"e_1_3_2_1_43_1","volume-title":"Q-align: Teaching lmms for visual scoring via discrete text-defined levels. arXiv preprint arXiv:2312.17090","author":"Wu Haoning","year":"2023","unstructured":"Haoning Wu, Zicheng Zhang, Weixia Zhang, Chaofeng Chen, Liang Liao, Chunyi Li, Yixuan Gao, Annan Wang, Erli Zhang, Wenxiu Sun, et al. 2023. Q-align: Teaching lmms for visual scoring via discrete text-defined levels. arXiv preprint arXiv:2312.17090 (2023)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00081"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657459"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01232"},{"key":"e_1_3_2_1_47_1","volume-title":"Yao Yao, and Siyu Zhu.","author":"Xu Mingwang","year":"2024","unstructured":"Mingwang Xu, Hui Li, Qingkun Su, Hanlin Shang, Liwei Zhang, Ce Liu, Jingdong Wang, Luc Van Gool, Yao Yao, and Siyu Zhu. 2024b. Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation. arXiv preprint arXiv:2406.08801 (2024)."},{"key":"e_1_3_2_1_48_1","volume-title":"Vasa-1: Lifelike audio-driven talking faces generated in real time. arXiv preprint arXiv:2404.10667","author":"Xu Sicheng","year":"2024","unstructured":"Sicheng Xu, Guojun Chen, Yu-Xiao Guo, Jiaolong Yang, Chong Li, Zhenyu Zang, Yizhong Zhang, Xin Tong, and Baining Guo. 2024a. Vasa-1: Lifelike audio-driven talking faces generated in real time. arXiv preprint arXiv:2404.10667 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"MegActor: Harness the Power of Raw Video for Vivid Portrait Animation. arXiv preprint arXiv:2405.20851","author":"Yang Shurong","year":"2024","unstructured":"Shurong Yang, Huadong Li, Juhao Wu, Minhao Jing, Linze Li, Renhe Ji, Jiajun Liang, and Haoqiang Fan. 2024. MegActor: Harness the Power of Raw Video for Vivid Portrait Animation. arXiv preprint arXiv:2405.20851 (2024)."},{"key":"e_1_3_2_1_50_1","volume-title":"Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721","author":"Ye Hu","year":"2023","unstructured":"Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. 2023. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00070"},{"key":"e_1_3_2_1_52_1","volume-title":"Dream-talk: diffusion-based realistic emotional audio-driven method for single image talking face generation. arXiv preprint arXiv:2312.13578","author":"Zhang Chenxu","year":"2023","unstructured":"Chenxu Zhang, Chao Wang, Jianfeng Zhang, Hongyi Xu, Guoxian Song, You Xie, Linjie Luo, Yapeng Tian, Xiaohu Guo, and Jiashi Feng. 2023b. Dream-talk: diffusion-based realistic emotional audio-driven method for single image talking face generation. arXiv preprint arXiv:2312.13578 (2023)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01814"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","location":"Chicago IL USA","acronym":"ICMR '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733464","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:13:22Z","timestamp":1755749602000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733464"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":56,"alternative-id":["10.1145\/3731715.3733464","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733464","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}