{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T10:25:52Z","timestamp":1780395952816,"version":"3.54.1"},"reference-count":82,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01597","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"17191-17202","source":"Crossref","is-referenced-by-count":2,"title":["VACE: All-in-One Video Creation and Editing"],"prefix":"10.1109","author":[{"given":"Zeyinzi","family":"Jiang","sequence":"first","affiliation":[{"name":"Tongyi Lab, Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhen","family":"Han","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chaojie","family":"Mao","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jingfeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yulin","family":"Pan","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yu","family":"Liu","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"KLING AI","year":"2025"},{"key":"ref2","year":"2022","journal-title":"Stable Diffusion v1.5 Model Card"},{"key":"ref3","volume-title":"Stable Diffusion Inpainting Model Card","year":"2022"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2929257"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00776"},{"key":"ref7","article-title":"PixArt- $\\alpha$: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis","volume-title":"arXiv preprint","author":"Chen","year":"2023"},{"key":"ref8","article-title":"Follow-Your-Canvas: Higher-Resolution Video Outpainting with Extensive Content Generation","author":"Chen","year":"2025","journal-title":"Assoc. Adv. Artif. Intell."},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02190"},{"key":"ref10","article-title":"Control-A-Video: Controllable Text-to-Video Diffusion Models with Motion Prior and Reward Feedback Learning","author":"Chen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00630"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01166"},{"key":"ref13","volume-title":"Tongyi Wanxiang","year":"2023"},{"key":"ref14","article-title":"FLATTEN: Optical FLow-guided ATTENtion for consistent text-to-video editing","author":"Cong","year":"2024","journal-title":"Int. Conf. Learn. Represent."},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00746"},{"key":"ref16","article-title":"Scaling Rectified Flow Transformers for High-Resolution Image Synthesis","volume-title":"Int. Conf. Mach. Learn.","author":"Esser","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612478"},{"key":"ref18","volume-title":"FLUX","year":"2024"},{"key":"ref19","article-title":"SEED-Data-Edit Technical Report: A Hybrid Dataset for Instructional Image Editing","author":"Ge","year":"2024","journal-title":"arXiv preprint"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657407"},{"key":"ref21","article-title":"PuLID: Pure and Lightning ID Customization via Contrastive Alignment","author":"Guo","year":"2024","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"ref22","article-title":"LTX-Video: Realtime Video Latent Diffusion","author":"HaCohen","year":"2025","journal-title":"arXiv preprint"},{"key":"ref23","article-title":"ACE: All-round Creator and Editor Following Instructions via Diffusion Transformer","author":"Han","year":"2025","journal-title":"Int. Conf. Learn. Represent."},{"key":"ref24","article-title":"Classifier-Free Diffusion Guidance","author":"Ho","year":"2021","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"ref25","article-title":"Denoising Diffusion Probabilistic Models","author":"Ho","year":"2020","journal-title":"Adv. Neural Inform. Process. Syst. Curran Associates, Inc."},{"key":"ref26","article-title":"Composer: Creative and Controllable Image Synthesis with Composable Conditions","volume-title":"Int. Conf. Mach. Learn.","author":"Huang","year":"2023"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref28","article-title":"VBench++: Comprehensive and Versatile Benchmark Suite for Video Generative Models","author":"Huang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1850"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00859"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"ref32","article-title":"HunyuanVideo: A Systematic Framework For Large Video Generative Models","author":"Kong","year":"2024","journal-title":"arXiv preprint"},{"key":"ref33","article-title":"HunyuanDiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref34","article-title":"MagicEdit: High-Fidelity and Temporally Coherent Video Editing","author":"Liew","year":"2023","journal-title":"arXiv preprint"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01387"},{"key":"ref36","article-title":"Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"ref38","article-title":"Cones: Concept Neurons in Diffusion Models for Customized Generation","author":"Liu","year":"2023","journal-title":"Int. Conf. Mach. Learn."},{"key":"ref39","article-title":"Cones 2: Customizable Image Synthesis with Multiple Subjects","volume-title":"Adv. Neural Inform. Process. Syst.","author":"Liu","year":"2023"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28206"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW69036.2025.00207"},{"key":"ref42","article-title":"SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations","volume-title":"Int. Conf. Learn. Represent.","author":"Meng","year":"2021"},{"key":"ref43","volume-title":"Midjourney","year":"2023"},{"key":"ref44","volume-title":"Hailuo AI Video","year":"2024"},{"key":"ref45","volume-title":"DALL.E 3","year":"2023"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591500"},{"key":"ref47","article-title":"Locate, Assign, Refine: Taming Customized Image Inpainting with Text-Subject Guidance","volume-title":"arXiv preprint","author":"Pan","year":"2024"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref49","volume-title":"PiKa","year":"2025"},{"key":"ref50","article-title":"UniControl: A Unified Diffusion Model for Controllable Visual Generation In the Wild","author":"Qin","year":"2023","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3019967"},{"key":"ref52","article-title":"SAM 2: Segment Anything in Images and Videos","author":"Ravi","year":"2025","journal-title":"Int. Conf. Learn. Represent."},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref54","article-title":"UNet: Convolutional Networks for Biomedical Image Segmentation","author":"Ronneberger","year":"2015","journal-title":"Med. Image Comput. Computer-Assisted Interv."},{"key":"ref55","volume-title":"Gen-3","year":"2025"},{"key":"ref56","article-title":"Denoising Diffusion Implicit Models","author":"Song","year":"2021","journal-title":"Int. Conf. Learn. Represent."},{"key":"ref57","article-title":"Score-Based Generative Modeling through Stochastic Differential Equations","author":"Song","year":"2021","journal-title":"Int. Conf. Learn. Represent."},{"key":"ref58","year":"2022","journal-title":"Stable Diffusion v2\u20131 Model Card"},{"key":"ref59","volume-title":"Stable Diffusion XL Model Card","year":"2022"},{"key":"ref60","volume-title":"CosXL Model Card","year":"2024"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2115"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00323"},{"key":"ref63","article-title":"OminiControl: Minimal and Universal Control for Diffusion Transformer","author":"Tan","year":"2024","journal-title":"arXiv preprint"},{"key":"ref64","year":"2025","journal-title":"Wan: Open and advanced large-scale video generative models"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"ref66","volume-title":"Vidu","year":"2025"},{"key":"ref67","article-title":"InstantID: Zero-shot Identity-Preserving Generation in Seconds","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0334"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657518"},{"key":"ref70","article-title":"DreamVideo-2: Zero-Shot Subject-Driven Video Customization with Precise Motion Control","author":"Wei","year":"2024","journal-title":"arXiv preprint"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01241"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00455"},{"key":"ref73","article-title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","author":"Yang","year":"2025","journal-title":"Int. Conf. Learn. Represent."},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01211"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1365"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref77","article-title":"I2VGen-XL: High-Quality Image-to-Video Synthesis via Cascaded Diffusion Models","volume-title":"arXiv preprint","author":"Zhang","year":"2023"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw63382.2024.00179"},{"key":"ref79","article-title":"ControlVideo: Training-free Controllable Text-to-Video Generation","author":"Zhang","year":"2024","journal-title":"Int. Conf. Learn. Represent."},{"key":"ref80","article-title":"Magic Mirror: ID- Preserved Video Generation in Video Diffusion Transformers","author":"Zhang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref81","article-title":"UltraEdit: Instruction-based Fine-Grained Image Editing at Scale","author":"Zhao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00961"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11446009.pdf?arnumber=11446009","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T04:57:41Z","timestamp":1777611461000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11446009\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":82,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01597","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}