{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:24:02Z","timestamp":1776889442166,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730673","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["VideoPainter: Any-length Video Inpainting and Editing with Plug-and-Play Context Control"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2362-2312","authenticated-orcid":false,"given":"Yuxuan","family":"Bian","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5583-6454","authenticated-orcid":false,"given":"Zhaoyang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0668-1375","authenticated-orcid":false,"given":"Xuan","family":"Ju","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6577-4715","authenticated-orcid":false,"given":"Mingdeng","family":"Cao","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4920-232X","authenticated-orcid":false,"given":"Liangbin","family":"Xie","sequence":"additional","affiliation":[{"name":"University of Macau, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7673-8325","authenticated-orcid":false,"given":"Ying","family":"Shan","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6747-126X","authenticated-orcid":false,"given":"Qiang","family":"Xu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_3_2_1","doi-asserted-by":"crossref","unstructured":"Omri Avrahami Ohad Fried and Dani Lischinski. 2023. Blended latent diffusion. ACM transactions on graphics (TOG) 42 4 (2023) 1\u201311.","DOI":"10.1145\/3592450"},{"key":"e_1_3_3_3_3_1","unstructured":"Jianhong Bai Tianyu He Yuchi Wang Junliang Guo Haoji Hu Zuozhu Liu and Jiang Bian. 2024. Uniedit: A unified tuning-free framework for video motion and appearance editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13185 (2024)."},{"key":"e_1_3_3_3_4_1","unstructured":"James Betker Gabriel Goh Li Jing Tim Brooks Jianfeng Wang Linjie Li Long Ouyang Juntang Zhuang Joyce Lee Yufei Guo et\u00a0al. 2023. Improving image generation with better captions. Computer Science. https:\/\/cdn. openai. com\/papers\/dall-e-3. pdf 2 3 (2023) 8."},{"key":"e_1_3_3_3_5_1","unstructured":"Yuxuan Bian Xuan Ju Jiangtong Li Zhijian Xu Dawei Cheng and Qiang Xu. 2024. Multi-patch prediction: Adapting llms for time series representation learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.04852 (2024)."},{"key":"e_1_3_3_3_6_1","unstructured":"Minghong Cai Xiaodong Cun Xiaoyu Li Wenze Liu Zhaoyang Zhang Yong Zhang Ying Shan and Xiangyu Yue. 2024. DiTCtrl: Exploring Attention Control in Multi-Modal Diffusion Transformer for Tuning-Free Multi-Prompt Longer Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.18597 (2024)."},{"key":"e_1_3_3_3_7_1","volume-title":"PySceneDetect: Intelligent Scene Cut Detection and Video Analysis Tool","author":"Castellano Brandon","year":"2024","unstructured":"Brandon Castellano. 2024. PySceneDetect: Intelligent Scene Cut Detection and Video Analysis Tool. https:\/\/github.com\/Breakthrough\/PySceneDetect"},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00916"},{"key":"e_1_3_3_3_9_1","unstructured":"Ya-Liang Chang Zhe\u00a0Yu Liu Kuan-Ying Lee and Winston Hsu. 2019b. Learnable gated temporal shift module for deep video inpainting. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.01131 (2019)."},{"key":"e_1_3_3_3_10_1","unstructured":"Junsong Chen Jincheng Yu Chongjian Ge Lewei Yao Enze Xie Yue Wu Zhongdao Wang James Kwok Ping Luo Huchuan Lu and Zhenguo Li. 2023. PixArt-\u03b1 : Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis. arxiv:https:\/\/arXiv.org\/abs\/2310.00426\u00a0[cs.CV]"},{"key":"e_1_3_3_3_11_1","volume-title":"CogVideoX-LoRA-Wallace_and_Gromit","year":"2024","unstructured":"Cseti. 2024. CogVideoX-LoRA-Wallace_and_Gromit. Hugging Face. https:\/\/huggingface.co\/Cseti\/CogVideoX-LoRA-Wallace_and_Gromit"},{"key":"e_1_3_3_3_12_1","volume-title":"Proceedings of the Neural Information Processing Systems (NeurIPS) Track on Datasets and Benchmarks","author":"Darkhalil Ahmad","year":"2022","unstructured":"Ahmad Darkhalil, Dandan Shan, Bin Zhu, Jian Ma, Amlan Kar, Richard Higgins, Sanja Fidler, David Fouhey, and Dima Damen. 2022. EPIC-KITCHENS VISOR Benchmark: VIdeo Segmentations and Object Relations. In Proceedings of the Neural Information Processing Systems (NeurIPS) Track on Datasets and Benchmarks."},{"key":"e_1_3_3_3_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01850"},{"key":"e_1_3_3_3_14_1","unstructured":"Zixun Fang Wei Zhai Aimin Su Hongliang Song Kai Zhu Mao Wang Yu Chen Zhiheng Liu Yang Cao and Zheng-Jun Zha. 2024. ViViD: Video Virtual Try-on using Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.11794 (2024)."},{"key":"e_1_3_3_3_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_42"},{"key":"e_1_3_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657407"},{"key":"e_1_3_3_3_17_1","unstructured":"Yuwei Guo Ceyuan Yang Anyi Rao Zhengyang Liang Yaohui Wang Yu Qiao Maneesh Agrawala Dahua Lin and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.04725 (2023)."},{"key":"e_1_3_3_3_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01240"},{"key":"e_1_3_3_3_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58583-9_3"},{"key":"e_1_3_3_3_20_1","unstructured":"Xuan Ju Xian Liu Xintao Wang Yuxuan Bian Ying Shan and Qiang Xu. 2024. Brushnet: A plug-and-play image inpainting model with decomposed dual-branch diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.06976 (2024)."},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00594"},{"key":"e_1_3_3_3_22_1","unstructured":"Kuaishou. 2024. KLING SPARK YOUR IMAGINATION. https:\/\/kling.kuaishou.com\/."},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00451"},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_42"},{"key":"e_1_3_3_3_25_1","unstructured":"Yaowei Li Yuxuan Bian Xuan Ju Zhaoyang Zhang Ying Shan and Qiang Xu. 2024. BrushEdit: All-In-One Image Inpainting and Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.10316 (2024)."},{"key":"e_1_3_3_3_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01704"},{"key":"e_1_3_3_3_27_1","unstructured":"Rui Liu Hanming Deng Yangyi Huang Xiaoyu Shi Lewei Lu Wenxiu Sun Xiaogang Wang Jifeng Dai and Hongsheng Li. 2021. Decoupled spatial-temporal transformer for video inpainting. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.06637 (2021)."},{"key":"e_1_3_3_3_28_1","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Chunyuan Li Jianwei Yang Hang Su Jun Zhu et\u00a0al. 2023. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.05499 (2023)."},{"key":"e_1_3_3_3_29_1","unstructured":"Chong Mou Mingdeng Cao Xintao Wang Zhaoyang Zhang Ying Shan and Jian Zhang. 2024. ReVideo: Remake a Video with Motion and Content Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.13865 (2024)."},{"key":"e_1_3_3_3_30_1","unstructured":"NVIDIA. 2025. NVIDIA Cosmos: Accelerate Physical AI Development with World Foundation Models. https:\/\/www.nvidia.com\/en-us\/ai\/cosmos\/"},{"key":"e_1_3_3_3_31_1","volume-title":"Hello GPT-4","year":"2024","unstructured":"OpenAI. 2024. Hello GPT-4. https:\/\/openai.com\/index\/hello-gpt-4o\/"},{"key":"e_1_3_3_3_32_1","unstructured":"OpenAI. 2024. Video generation models as world simulators. https:\/\/openai.com\/sora\/."},{"key":"e_1_3_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_3_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"e_1_3_3_3_35_1","unstructured":"Adam Polyak Amit Zohar Andrew Brown Andros Tjandra Animesh Sinha Ann Lee Apoorv Vyas Bowen Shi Chih-Yao Ma Ching-Yao Chuang et\u00a0al. 2024. Movie gen: A cast of media foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.13720 (2024)."},{"key":"e_1_3_3_3_36_1","doi-asserted-by":"crossref","unstructured":"Weize Quan Jiaxi Chen Yanli Liu Dong-Ming Yan and Peter Wonka. 2024. Deep learning-based image and video inpainting: A survey. International Journal of Computer Vision 132 7 (2024) 2367\u20132400.","DOI":"10.1007\/s11263-023-01977-6"},{"key":"e_1_3_3_3_37_1","unstructured":"Nikhila Ravi Valentin Gabeur Yuan-Ting Hu Ronghang Hu Chaitanya Ryali Tengyu Ma Haitham Khedr Roman R\u00e4dle Chloe Rolland Laura Gustafson et\u00a0al. 2024. Sam 2: Segment anything in images and videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.00714 (2024)."},{"key":"e_1_3_3_3_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_3_39_1","unstructured":"Robin Rombach and Patrick Esser. 2022. Stable Diffusion 2 Inpainting. https:\/\/huggingface.co\/stabilityai\/stable-diffusion-2-inpainting."},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"crossref","unstructured":"Christoph Schuhmann Romain Beaumont Richard Vencu Cade Gordon Ross Wightman Mehdi Cherti Theo Coombes Aarush Katta Clayton Mullis Mitchell Wortsman et\u00a0al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems 35 (2022) 25278\u201325294.","DOI":"10.52202\/068431-1833"},{"key":"e_1_3_3_3_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657497"},{"key":"e_1_3_3_3_42_1","unstructured":"Wenhao Sun Rong-Cheng Tu Jingyi Liao and Dacheng Tao. 2024. Diffusion model-based video editing: A survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.07111 (2024)."},{"key":"e_1_3_3_3_43_1","unstructured":"Gemini Team Petko Georgiev Ving\u00a0Ian Lei Ryan Burnell Libin Bai Anmol Gulati Garrett Tanzer Damien Vincent Zhufeng Pan Shibo Wang et\u00a0al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.05530 (2024)."},{"key":"e_1_3_3_3_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"e_1_3_3_3_45_1","volume-title":"CVPR","author":"Tokmakov Pavel","year":"2023","unstructured":"Pavel Tokmakov, Jie Li, and Adrien Gaidon. 2023. Breaking the \u201cObject\u201d in Video Object Segmentation. In CVPR."},{"key":"e_1_3_3_3_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33015232"},{"key":"e_1_3_3_3_47_1","volume-title":"Advances in Neural Information Processing Systems","author":"Wang Ting-Chun","year":"2018","unstructured":"Ting-Chun Wang, Ming-Yu Liu, Jun-Yan Zhu, Guilin Liu, Andrew Tao, Jan Kautz, and Bryan Catanzaro. 2018. Video-to-Video Synthesis. In Advances in Neural Information Processing Systems , S.\u00a0Bengio, H.\u00a0Wallach, H.\u00a0Larochelle, K.\u00a0Grauman, N.\u00a0Cesa-Bianchi, and R.\u00a0Garnett (Eds.), Vol.\u00a031. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2018\/file\/d86ea612dec96096c5e0fcc8dd42ab6d-Paper.pdf"},{"key":"e_1_3_3_3_48_1","unstructured":"Weihan Wang Qingsong Lv Wenmeng Yu Wenyi Hong Ji Qi Yan Wang Junhui Ji Zhuoyi Yang Lei Zhao Xixuan Song Jiazheng Xu Bin Xu Juanzi Li Yuxiao Dong Ming Ding and Jie Tang. 2023. CogVLM: Visual Expert for Pretrained Language Models. arxiv:https:\/\/arXiv.org\/abs\/2311.03079\u00a0[cs.CV]"},{"key":"e_1_3_3_3_49_1","unstructured":"Xiang Wang Hangjie Yuan Shiwei Zhang Dayou Chen Jiuniu Wang Yingya Zhang Yujun Shen Deli Zhao and Jingren Zhou. 2024. Videocomposer: Compositional video synthesis with motion controllability. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_3_50_1","doi-asserted-by":"crossref","unstructured":"Zhou Wang Alan\u00a0C Bovik Hamid\u00a0R Sheikh and Eero\u00a0P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE Transactions on Image Processing 13 4 (2004) 600\u2013612.","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_3_3_51_1","unstructured":"Wikipedia contributors. 2024a. Mean absolute error \u2014 Wikipedia The Free Encyclopedia. https:\/\/en.wikipedia.org\/wiki\/Mean_absolute_error"},{"key":"e_1_3_3_3_52_1","unstructured":"Wikipedia contributors. 2024b. Mean squared error \u2014 Wikipedia The Free Encyclopedia. https:\/\/en.wikipedia.org\/wiki\/Mean_squared_error"},{"key":"e_1_3_3_3_53_1","unstructured":"Wikipedia contributors. 2024c. Peak signal-to-noise ratio \u2014 Wikipedia The Free Encyclopedia. https:\/\/en.wikipedia.org\/w\/index.php?title=Peak_signal-to-noise_ratio&oldid=1210897995 [Online; accessed 4-March-2024]."},{"key":"e_1_3_3_3_54_1","unstructured":"Chenfei Wu Lun Huang Qianxi Zhang Binyang Li Lei Ji Fan Yang Guillermo Sapiro and Nan Duan. 2021. GODIVA: Generating open-domain videos from natural descriptions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.14806 (2021)."},{"key":"e_1_3_3_3_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_36"},{"key":"e_1_3_3_3_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00384"},{"key":"e_1_3_3_3_57_1","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang Jiazheng Xu Yuanming Yang Wenyi Hong Xiaohan Zhang Guanyu Feng et\u00a0al. 2024. CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06072 (2024)."},{"key":"e_1_3_3_3_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_31"},{"key":"e_1_3_3_3_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19797-0_5"},{"key":"e_1_3_3_3_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19797-0_5"},{"key":"e_1_3_3_3_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00589"},{"key":"e_1_3_3_3_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_3_3_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_3_3_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00179"},{"key":"e_1_3_3_3_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00684"},{"key":"e_1_3_3_3_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-96530-3"},{"key":"e_1_3_3_3_67_1","unstructured":"Bojia Zi Shihao Zhao Xianbiao Qi Jianan Wang Yukai Shi Qianyu Chen Bin Liang Kam-Fai Wong and Lei Zhang. 2024. CoCoCo: Improving Text-Guided Video Inpainting for Better Consistency Controllability and Compatibility. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.12035 (2024)."},{"key":"e_1_3_3_3_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01618"}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730673","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:54:56Z","timestamp":1774018496000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730673"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":67,"alternative-id":["10.1145\/3721238.3730673","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730673","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}