{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T16:37:13Z","timestamp":1771951033271,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China under grant number","award":["2022ZD0161501"],"award-info":[{"award-number":["2022ZD0161501"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3680528.3687587","type":"proceedings-article","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T08:14:37Z","timestamp":1733213677000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":30,"title":["Follow-Your-Emoji: Fine-Controllable and Expressive Freestyle Portrait Animation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-7477-5938","authenticated-orcid":false,"given":"Yue","family":"Ma","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, HongKong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4628-6388","authenticated-orcid":false,"given":"Hongyu","family":"Liu","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, HongKong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8230-9471","authenticated-orcid":false,"given":"Hongfa","family":"Wang","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9753-2127","authenticated-orcid":false,"given":"Heng","family":"Pan","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0134-8220","authenticated-orcid":false,"given":"Yingqing","family":"He","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, HongKong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0012-7397","authenticated-orcid":false,"given":"Junkun","family":"Yuan","sequence":"additional","affiliation":[{"name":"Tencent, ShenZhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3783-0679","authenticated-orcid":false,"given":"Ailing","family":"Zeng","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0811-3193","authenticated-orcid":false,"given":"Chengfei","family":"Cai","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4684-911X","authenticated-orcid":false,"given":"Heung-Yeung","family":"Shum","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3865-8145","authenticated-orcid":false,"given":"Wei","family":"Liu","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2199-3948","authenticated-orcid":false,"given":"Qifeng","family":"Chen","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, HongKong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"e_1_3_3_1_2_1","unstructured":"2023. civitai. https:\/\/civitai.com\/models\/443821\/cyberrealistic-pony."},{"key":"e_1_3_3_1_3_1","unstructured":"2023. duchaitenpony-real. https:\/\/civitai.com\/models\/477851\/duchaiten-pony-real."},{"key":"e_1_3_3_1_4_1","unstructured":"2023. Gen-2. https:\/\/runwayml.com\/ai-magic-tools\/gen-2\/."},{"key":"e_1_3_3_1_5_1","unstructured":"2023. wairealmix. https:\/\/civitai.com\/models\/393905\/wai-realmix."},{"key":"e_1_3_3_1_6_1","doi-asserted-by":"crossref","unstructured":"Hadar Averbuch-Elor Daniel Cohen-Or Johannes Kopf and Michael\u00a0F Cohen. 2017. Bringing portraits to life. ACM transactions on graphics (TOG) 36 6 (2017) 1\u201313.","DOI":"10.1145\/3130800.3130818"},{"key":"e_1_3_3_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_3_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_3_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"e_1_3_3_1_10_1","unstructured":"Di Chang Yichun Shi Quankai Gao Jessica Fu Hongyi Xu Guoxian Song Qing Yan Yizhe Zhu Xiao Yang and Mohammad Soleymani. 2024. MagicPose: Realistic Human Poses and Facial Expressions Retargeting with Identity-aware Diffusion. arxiv:https:\/\/arXiv.org\/abs\/2311.12052\u00a0[cs.CV]"},{"key":"e_1_3_3_1_11_1","unstructured":"Haoxin Chen Menghan Xia Yingqing He Yong Zhang Xiaodong Cun Shaoshu Yang Jinbo Xing Yaofang Liu Qifeng Chen Xintao Wang Chao Weng and Ying Shan. 2023. VideoCrafter1: Open Diffusion Models for High-Quality Video Generation. arxiv:https:\/\/arXiv.org\/abs\/2310.19512\u00a0[cs.CV]"},{"key":"e_1_3_3_1_12_1","doi-asserted-by":"crossref","unstructured":"Haoxin Chen Yong Zhang Xiaodong Cun Menghan Xia Xintao Wang Chao Weng and Ying Shan. 2024. VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2401.09047\u00a0[cs.CV]","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"e_1_3_3_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_3_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00520"},{"key":"e_1_3_3_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547838"},{"key":"e_1_3_3_1_16_1","doi-asserted-by":"crossref","unstructured":"Yao Feng Haiwen Feng Michael\u00a0J Black and Timo Bolkart. 2021. Learning an animatable detailed 3D face model from in-the-wild images. ACM Transactions on Graphics (ToG) 40 4 (2021) 1\u201313.","DOI":"10.1145\/3476576.3476646"},{"key":"e_1_3_3_1_17_1","doi-asserted-by":"crossref","unstructured":"Ohad Fried Ayush Tewari Michael Zollh\u00f6fer Adam Finkelstein Eli Shechtman Dan\u00a0B Goldman Kyle Genova Zeyu Jin Christian Theobalt and Maneesh Agrawala. 2019. Text-based editing of talking-head video. ACM Transactions on Graphics (TOG) 38 4 (2019) 1\u201314.","DOI":"10.1145\/3306346.3323028"},{"key":"e_1_3_3_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00707"},{"key":"e_1_3_3_1_19_1","doi-asserted-by":"crossref","unstructured":"Ian Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2020. Generative adversarial networks. Commun. ACM 63 11 (2020) 139\u2013144.","DOI":"10.1145\/3422622"},{"key":"e_1_3_3_1_20_1","unstructured":"Yuwei Guo Ceyuan Yang Anyi Rao Yaohui Wang Yu Qiao Dahua Lin and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.04725 (2023)."},{"key":"e_1_3_3_1_21_1","unstructured":"Tianyu He Junliang Guo Runyi Yu Yuchi Wang Jialiang Zhu Kaikai An Leyi Li Xu Tan Chunyu Wang Han Hu et\u00a0al. 2023. GAIA: Zero-shot Talking Avatar Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.15230 (2023)."},{"key":"e_1_3_3_1_22_1","unstructured":"Yingqing He Tianyu Yang Yong Zhang Ying Shan and Qifeng Chen. 2022a. Latent Video Diffusion Models for High-Fidelity Long Video Generation. (2022). arxiv:https:\/\/arXiv.org\/abs\/2211.13221\u00a0[cs.CV]"},{"key":"e_1_3_3_1_23_1","unstructured":"Yingqing He Tianyu Yang Yong Zhang Ying Shan and Qifeng Chen. 2022b. Latent video diffusion models for high-fidelity long video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.13221 (2022)."},{"key":"e_1_3_3_1_24_1","unstructured":"Amir Hertz Ron Mokady Jay Tenenbaum Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.01626 (2022)."},{"key":"e_1_3_3_1_25_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik\u00a0P Kingma Ben Poole Mohammad Norouzi David\u00a0J Fleet et\u00a0al. 2022a. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.02303 (2022)."},{"key":"e_1_3_3_1_26_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_1_27_1","unstructured":"Jonathan Ho Tim Salimans Alexey Gritsenko William Chan Mohammad Norouzi and David\u00a0J Fleet. 2022b. Video diffusion models. Advances in Neural Information Processing Systems 35 (2022) 8633\u20138646."},{"key":"e_1_3_3_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02108"},{"key":"e_1_3_3_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00339"},{"key":"e_1_3_3_1_30_1","unstructured":"Li Hu Xin Gao Peng Zhang Ke Sun Bang Zhang and Liefeng Bo. 2023. Animate anyone: Consistent and controllable image-to-video synthesis for character animation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.17117 (2023)."},{"key":"e_1_3_3_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20086-1_20"},{"key":"e_1_3_3_1_32_1","doi-asserted-by":"crossref","unstructured":"Hyeongwoo Kim Pablo Garrido Ayush Tewari Weipeng Xu Justus Thies Matthias Niessner Patrick P\u00e9rez Christian Richardt Michael Zollh\u00f6fer and Christian Theobalt. 2018. Deep video portraits. ACM transactions on graphics (TOG) 37 4 (2018) 1\u201314.","DOI":"10.1145\/3197517.3201283"},{"key":"e_1_3_3_1_33_1","unstructured":"Gongye Liu Menghan Xia Yong Zhang Haoxin Chen Jinbo Xing Xintao Wang Yujiu Yang and Ying Shan. 2023b. StyleCrafter: Enhancing Stylized Text-to-Video Generation with Style Adapter. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.00330 (2023)."},{"key":"e_1_3_3_1_34_1","unstructured":"Hongyu Liu Xintong Han Chengbin Jin Lihui Qian Huawei Wei Zhe Lin Faqiang Wang Haoye Dong Yibing Song Jia Xu et\u00a0al. 2023a. Human motionformer: Transferring human motions with vision transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.11306 (2023)."},{"key":"e_1_3_3_1_35_1","unstructured":"Shaoteng Liu Yuechen Zhang Wenbo Li Zhe Lin and Jiaya Jia. 2023c. Video-p2p: Video editing with cross-attention control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.04761 (2023)."},{"key":"e_1_3_3_1_36_1","unstructured":"Camillo Lugaresi Jiuqiang Tang Hadon Nash Chris McClanahan Esha Uboweja Michael Hays Fan Zhang Chuo-Ling Chang Ming\u00a0Guang Yong Juhyun Lee et\u00a0al. 2019. Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1906.08172 (2019)."},{"key":"e_1_3_3_1_37_1","unstructured":"Yue Ma Xiaodong Cun Yingqing He Chenyang Qi Xintao Wang Ying Shan Xiu Li and Qifeng Chen. 2023. MagicStick: Controllable Video Editing via Control Handle Transformations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.03047 (2023)."},{"key":"e_1_3_3_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28206"},{"key":"e_1_3_3_1_39_1","unstructured":"Yue Ma Yingqing He Hongfa Wang Andong Wang Chenyang Qi Chengfei Cai Xiu Li Zhifeng Li Heung-Yeung Shum Wei Liu et\u00a0al. 2024b. Follow-Your-Click: Open-domain Regional Image Animation via Short Prompts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.08268 (2024)."},{"key":"e_1_3_3_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"e_1_3_3_1_41_1","unstructured":"Linzi Qu Jiaxiang Shang Xiaoguang Han and Hongbo Fu. 2023. ReenactArtFace: Artistic Face Image Reenactment. IEEE Transactions on Visualization and Computer Graphics (2023)."},{"key":"e_1_3_3_1_42_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_43_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 1 2 (2022) 3."},{"key":"e_1_3_3_1_44_1","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2112.10752\u00a0[cs.CV]"},{"key":"e_1_3_3_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_3_1_46_1","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily\u00a0L Denton Kamyar Ghasemipour Raphael Gontijo\u00a0Lopes Burcu Karagol\u00a0Ayan Tim Salimans et\u00a0al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems 35 (2022) 36479\u201336494."},{"key":"e_1_3_3_1_47_1","unstructured":"Aliaksandr Siarohin St\u00e9phane Lathuili\u00e8re Sergey Tulyakov Elisa Ricci and Nicu Sebe. 2019. First order motion model for image animation. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_1_48_1","unstructured":"Uriel Singer Adam Polyak Thomas Hayes Xi Yin Jie An Songyang Zhang Qiyuan Hu Harry Yang Oron Ashual Oran Gafni et\u00a0al. 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.14792 (2022)."},{"key":"e_1_3_3_1_49_1","first-page":"2256","volume-title":"International conference on machine learning","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International conference on machine learning. PMLR, 2256\u20132265."},{"key":"e_1_3_3_1_50_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.02502 (2020)."},{"key":"e_1_3_3_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02011"},{"key":"e_1_3_3_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/2929464.2929475"},{"key":"e_1_3_3_1_53_1","doi-asserted-by":"crossref","unstructured":"Linrui Tian Qi Wang Bang Zhang and Liefeng Bo. 2024. EMO: Emote Portrait Alive-Generating Expressive Portrait Videos with Audio2Video Diffusion Model under Weak Conditions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.17485 (2024).","DOI":"10.1007\/978-3-031-73010-8_15"},{"key":"e_1_3_3_1_54_1","unstructured":"Thomas Unterthiner Sjoerd Van\u00a0Steenkiste Karol Kurach Raphael Marinier Marcin Michalski and Sylvain Gelly. 2018. Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1812.01717 (2018)."},{"key":"e_1_3_3_1_55_1","unstructured":"Haofan Wang Qixun Wang Xu Bai Zekui Qin and Anthony Chen. 2024a. InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.02733 (2024)."},{"key":"e_1_3_3_1_56_1","unstructured":"Haofan Wang Peng Xing Renyuan Huang Hao Ai Qixun Wang and Xu Bai. 2024b. InstantStyle-Plus: Style Transfer with Content-Preserving in Text-to-Image Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.00788 (2024)."},{"key":"e_1_3_3_1_57_1","unstructured":"Jiuniu Wang Hangjie Yuan Dayou Chen Yingya Zhang Xiang Wang and Shiwei Zhang. 2023b. Modelscope text-to-video technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.06571 (2023)."},{"key":"e_1_3_3_1_58_1","unstructured":"Tan Wang Linjie Li Kevin Lin Yuanhao Zhai Chung-Ching Lin Zhengyuan Yang Hanwang Zhang Zicheng Liu and Lijuan Wang. 2023a. Disco: Disentangled control for realistic human dance generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.00040 (2023)."},{"key":"e_1_3_3_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"e_1_3_3_1_60_1","doi-asserted-by":"crossref","unstructured":"Zhou Wang Alan\u00a0C Bovik Hamid\u00a0R Sheikh and Eero\u00a0P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing 13 4 (2004) 600\u2013612.","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_3_1_61_1","unstructured":"Huawei Wei Zejun Yang and Zhisheng Wang. 2024. Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.17694 (2024)."},{"key":"e_1_3_3_1_62_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"e_1_3_3_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"e_1_3_3_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00081"},{"key":"e_1_3_3_1_65_1","unstructured":"You Xie Hongyi Xu Guoxian Song Chao Wang Yichun Shi and Linjie Luo. 2024. X-Portrait: Expressive Portrait Animation with Hierarchical Motion Attention. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.15931 (2024)."},{"key":"e_1_3_3_1_66_1","doi-asserted-by":"crossref","unstructured":"Jinbo Xing Menghan Xia Yuxin Liu Yuechen Zhang Y He H Liu H Chen X Cun X Wang Y Shan et\u00a0al. 2024. Make-Your-Video: Customized Video Generation Using Textual and Structural Guidance. IEEE Transactions on Visualization and Computer Graphics (2024).","DOI":"10.1109\/TVCG.2024.3365804"},{"key":"e_1_3_3_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01232"},{"key":"e_1_3_3_1_68_1","unstructured":"Sicheng Xu Guojun Chen Yu-Xiao Guo Jiaolong Yang Chong Li Zhenyu Zang Yizhong Zhang Xin Tong and Baining Guo. 2024a. VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.10667 (2024)."},{"key":"e_1_3_3_1_69_1","unstructured":"Zhongcong Xu Jianfeng Zhang Jun\u00a0Hao Liew Hanshu Yan Jia-Wei Liu Chenxu Zhang Jiashi Feng and Mike\u00a0Zheng Shou. 2024b. MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model."},{"key":"e_1_3_3_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00455"},{"key":"e_1_3_3_1_71_1","unstructured":"Hu Ye Jun Zhang Sibo Liu Xiao Han and Wei Yang. 2023. IP-Adapter: Text Compatible Image Prompt Adapter for Text-to-Image Diffusion Models. (2023)."},{"key":"e_1_3_3_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00070"},{"key":"e_1_3_3_1_73_1","unstructured":"Qiming Zhang Jing Zhang Yufei Xu and Dacheng Tao. 2023d. Vision Transformer with Quadrangle Attention. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.15105 (2023)."},{"key":"e_1_3_3_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_3_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"e_1_3_3_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01352"},{"key":"e_1_3_3_1_77_1","unstructured":"Yabo Zhang Yuxiang Wei Dongsheng Jiang Xiaopeng Zhang Wangmeng Zuo and Qi Tian. 2023b. Controlvideo: Training-free controllable text-to-video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.13077 (2023)."},{"key":"e_1_3_3_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_3_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00878"},{"key":"e_1_3_3_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"e_1_3_3_1_81_1","unstructured":"Daquan Zhou Weimin Wang Hanshu Yan Weiwei Lv Yizhe Zhu and Jiashi Feng. 2022. Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.11018 (2022)."},{"key":"e_1_3_3_1_82_1","unstructured":"Shenhao Zhu Junming\u00a0Leo Chen Zuozhuo Dai Yinghui Xu Xun Cao Yao Yao Hao Zhu and Siyu Zhu. 2024. Champ: Controllable and Consistent Human Image Animation with 3D Parametric Guidance. arxiv:https:\/\/arXiv.org\/abs\/2403.14781\u00a0[cs.CV]"}],"event":{"name":"SA '24: SIGGRAPH Asia 2024 Conference Papers","location":"Tokyo Japan","acronym":"SA '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["SIGGRAPH Asia 2024 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687587","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680528.3687587","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:58:26Z","timestamp":1750294706000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687587"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":81,"alternative-id":["10.1145\/3680528.3687587","10.1145\/3680528"],"URL":"https:\/\/doi.org\/10.1145\/3680528.3687587","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}