{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:26:57Z","timestamp":1765308417786,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755474","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"4494-4503","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["AnchorSync: Global Consistency Optimization for Long Video Editing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-7154-4220","authenticated-orcid":false,"given":"Zichi","family":"Liu","sequence":"first","affiliation":[{"name":"MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6686-6603","authenticated-orcid":false,"given":"Yinggui","family":"Wang","sequence":"additional","affiliation":[{"name":"Ant Group, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4027-0310","authenticated-orcid":false,"given":"Tao","family":"Wei","sequence":"additional","affiliation":[{"name":"Ant Group, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8459-2845","authenticated-orcid":false,"given":"Chao","family":"Ma","sequence":"additional","affiliation":[{"name":"MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1049\/ipr2.12008"},{"key":"e_1_3_2_2_3_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts et al. 2023. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv:2311.15127 (2023)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01265"},{"key":"e_1_3_2_2_7_1","first-page":"8780","article-title":"Diffusion models beat gans on image synthesis","volume":"34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in Neural Information Processing Systems, Vol. 34 (2021), 8780-8794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00641"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"e_1_3_2_2_10_1","volume-title":"Tokenflow: Consistent diffusion features for consistent video editing. arXiv:2307.10373","author":"Geyer Michal","year":"2023","unstructured":"Michal Geyer, Omer Bar-Tal, Shai Bagon, and Tali Dekel. 2023. Tokenflow: Consistent diffusion features for consistent video editing. arXiv:2307.10373 (2023)."},{"key":"e_1_3_2_2_11_1","volume-title":"HybridBooth: Hybrid Prompt Inversion for Efficient Subject-Driven Generation. In European Conference on Computer Vision. 403-419","author":"Guan Shanyan","year":"2025","unstructured":"Shanyan Guan, Yanhao Ge, Ying Tai, Jian Yang, Wei Li, and Mingyu You. 2025. HybridBooth: Hybrid Prompt Inversion for Efficient Subject-Driven Generation. In European Conference on Computer Vision. 403-419."},{"key":"e_1_3_2_2_12_1","volume-title":"Freestyle: Free lunch for text-guided style transfer using diffusion models. arXiv:2401.15636","author":"He Feihong","year":"2024","unstructured":"Feihong He, Gang Li, Mengyuan Zhang, Leilei Yan, Lingyu Si, Fanzhang Li, and Li Shen. 2024a. Freestyle: Free lunch for text-guided style transfer using diffusion models. arXiv:2401.15636 (2024)."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2023.102984"},{"key":"e_1_3_2_2_15_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022a. Prompt-to-prompt image editing with cross attention control. arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_2_16_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022b. Prompt-to-prompt image editing with cross attention control. arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_2_17_1","volume-title":"International Conference on Learning Representations","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. Lora: Low-rank adaptation of large language models. International Conference on Learning Representations, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_2_18_1","volume-title":"Cocktail: Mixing multi-modality control for text-conditional image generation. In Advances in Neural Information Processing Systems.","author":"Hu Minghui","year":"2023","unstructured":"Minghui Hu, Jianbin Zheng, Daqing Liu, Chuanxia Zheng, Chaoyue Wang, Dacheng Tao, and Tat-Jen Cham. 2023. Cocktail: Mixing multi-modality control for text-conditional image generation. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_2_19_1","volume-title":"Synthetic data augmentation for surface defect detection and classification using deep learning. Journal of Intelligent Manufacturing","author":"Jain Saksham","year":"2022","unstructured":"Saksham Jain, Gautam Seth, Arpit Paruthi, Umang Soni, and Girish Kumar. 2022. Synthetic data augmentation for surface defect detection and classification using deep learning. Journal of Intelligent Manufacturing (2022), 1-14."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323006"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"e_1_3_2_2_22_1","volume-title":"Adam: A method for stochastic optimization. arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma. 2014. Adam: A method for stochastic optimization. arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_2_23_1","volume-title":"Anyv2v: A plug-and-play framework for any video-to-video editing tasks. arXiv:2403.14468","author":"Ku Max","year":"2024","unstructured":"Max Ku, Cong Wei, Weiming Ren, Huan Yang, and Wenhu Chen. 2024. Anyv2v: A plug-and-play framework for any video-to-video editing tasks. arXiv:2403.14468 (2024)."},{"key":"e_1_3_2_2_24_1","first-page":"16240","article-title":"Collaborative Video Diffusion: Consistent Multi-video Generation with Camera Control","volume":"37","author":"Kuang Zhengfei","year":"2024","unstructured":"Zhengfei Kuang, Shengqu Cai, Hao He, Yinghao Xu, Hongsheng Li, Leonidas Guibas, and Gordon Wetzstein. 2024. Collaborative Video Diffusion: Consistent Multi-video Generation with Camera Control. Advances in Neural Information Processing Systems, Vol. 37 (2024), 16240-16271.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_25_1","volume-title":"Diffusion-based image translation using disentangled style and content representation. arXiv:2209.15264","author":"Kwon Gihyun","year":"2022","unstructured":"Gihyun Kwon and Jong Chul Ye. 2022. Diffusion-based image translation using disentangled style and content representation. arXiv:2209.15264 (2022)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_11"},{"key":"e_1_3_2_2_27_1","volume-title":"VidToMe: Video Token Merging for Zero-Shot Video Editing. In IEEE Conference on Computer Vision and Pattern Recognition. 7486-7495","author":"Li Xirui","year":"2024","unstructured":"Xirui Li, Chao Ma, Xiaokang Yang, and Ming-Hsuan Yang. 2024. VidToMe: Video Token Merging for Zero-Shot Video Editing. In IEEE Conference on Computer Vision and Pattern Recognition. 7486-7495."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3715014.3722084"},{"key":"e_1_3_2_2_29_1","volume-title":"Looking Backward: Streaming Video-to-Video Translation with Feature Banks. arXiv:2405.15757","author":"Liang Feng","year":"2024","unstructured":"Feng Liang, Akio Kodaira, Chenfeng Xu, Masayoshi Tomizuka, Kurt Keutzer, and Diana Marculescu. 2024a. Looking Backward: Streaming Video-to-Video Translation with Feature Banks. arXiv:2405.15757 (2024)."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00784"},{"key":"e_1_3_2_2_31_1","volume-title":"Keyvideollm: Towards large-scale video keyframe selection. arXiv:2407.03104","author":"Liang Hao","year":"2024","unstructured":"Hao Liang, Jiapeng Li, Tianyi Bai, Xijie Huang, Linzhuang Sun, Zhengren Wang, Conghui He, Bin Cui, Chong Chen, and Wentao Zhang. 2024b. Keyvideollm: Towards large-scale video keyframe selection. arXiv:2407.03104 (2024)."},{"key":"e_1_3_2_2_32_1","volume-title":"Towards Understanding Cross and Self-Attention in Stable Diffusion for Text-Guided Image Editing. In IEEE Conference on Computer Vision and Pattern Recognition. 7817-7826","author":"Liu Bingyan","year":"2024","unstructured":"Bingyan Liu, Chengyu Wang, Tingfeng Cao, Kui Jia, and Jun Huang. 2024. Towards Understanding Cross and Self-Attention in Stable Diffusion for Text-Guided Image Editing. In IEEE Conference on Computer Vision and Pattern Recognition. 7817-7826."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_26"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/JBHI.2020.3019198"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_2_36_1","first-page":"18481","article-title":"ReVideo: Remake a Video with Motion and Content Control","volume":"37","author":"Mou Chong","year":"2024","unstructured":"Chong Mou, Mingdeng Cao, Xintao Wang, Zhaoyang Zhang, Ying Shan, and Jian Zhang. 2024. ReVideo: Remake a Video with Motion and Content Control. Advances in Neural Information Processing Systems, Vol. 37 (2024), 18481-18505.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_37_1","volume-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10845-022-02068-y"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687656"},{"key":"e_1_3_2_2_40_1","volume-title":"Richard Zhang, Yijun Li, Jingwan Lu, and Jun-Yan Zhu.","author":"Parmar Gaurav","year":"2023","unstructured":"Gaurav Parmar, Krishna Kumar Singh, Richard Zhang, Yijun Li, Jingwan Lu, and Jun-Yan Zhu. 2023. Zero-shot image-to-image translation. In Association for Computing Machinery Special Interest Group on Computer Graphics and Interactive Techniques. 1-11."},{"key":"e_1_3_2_2_41_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv:2204.06125, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_43_1","volume-title":"DreamMover: Leveraging the Prior of Diffusion Models for Image Interpolation with Large Motion. In European Conference on Computer Vision. Springer, 336-353","author":"Shen Liao","year":"2024","unstructured":"Liao Shen, Tianqi Liu, Huiqiang Sun, Xinyi Ye, Baopu Li, Jianming Zhang, and Zhiguo Cao. 2024. DreamMover: Leveraging the Prior of Diffusion Models for Image Interpolation with Large Motion. In European Conference on Computer Vision. Springer, 336-353."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/s40747-021-00569-6"},{"key":"e_1_3_2_2_45_1","volume-title":"International Conference on Machine Learning. 2256-2265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International Conference on Machine Learning. 2256-2265."},{"key":"e_1_3_2_2_46_1","volume-title":"Score-based generative modeling through stochastic differential equations. arXiv:2011.13456","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2020. Score-based generative modeling through stochastic differential equations. arXiv:2011.13456 (2020)."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"e_1_3_2_2_48_1","volume-title":"Gen-l-video: Multi-text to long video generation via temporal co-denoising. arXiv:2305.18264","author":"Wang Fu-Yun","year":"2023","unstructured":"Fu-Yun Wang, Wenshuo Chen, Guanglu Song, Han-Jia Ye, Yu Liu, and Hongsheng Li. 2023a. Gen-l-video: Multi-text to long video generation via temporal co-denoising. arXiv:2305.18264 (2023)."},{"key":"e_1_3_2_2_49_1","volume-title":"Zero-shot video editing using off-the-shelf image diffusion models. arXiv:2303.17599","author":"Wang Wen","year":"2023","unstructured":"Wen Wang, Kangyang Xie, Zide Liu, Hao Chen, Yue Cao, Xinlong Wang, and Chunhua Shen. 2023b. Zero-shot video editing using off-the-shelf image diffusion models. arXiv:2303.17599 (2023)."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"e_1_3_2_2_51_1","volume-title":"Fastcomposer: Tuning-free multi-subject image generation with localized attention. International Journal of Computer Vision","author":"Xiao Guangxuan","year":"2024","unstructured":"Guangxuan Xiao, Tianwei Yin, William T Freeman, Fr\u00e9do Durand, and Song Han. 2024. Fastcomposer: Tuning-free multi-subject image generation with localized attention. International Journal of Computer Vision (2024), 1-20."},{"key":"e_1_3_2_2_52_1","volume-title":"Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation. In SIGGRAPH Asia 2023 Conference Papers. 1-11","author":"Yang Shuai","year":"2023","unstructured":"Shuai Yang, Yifan Zhou, Ziwei Liu, and Chen Change Loy. 2023. Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation. In SIGGRAPH Asia 2023 Conference Papers. 1-11."},{"key":"e_1_3_2_2_53_1","volume-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv:2203.03605","author":"Zhang Hao","year":"2022","unstructured":"Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M Ni, and Heung-Yeung Shum. 2022. Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv:2203.03605 (2022)."},{"key":"e_1_3_2_2_54_1","volume-title":"DiffMorpher: Unleashing the Capability of Diffusion Models for Image Morphing. In IEEE Conference on Computer Vision and Pattern Recognition. 7912-7921","author":"Zhang Kaiwen","year":"2024","unstructured":"Kaiwen Zhang, Yifan Zhou, Xudong Xu, Bo Dai, and Xingang Pan. 2024b. DiffMorpher: Unleashing the Capability of Diffusion Models for Image Morphing. In IEEE Conference on Computer Vision and Pattern Recognition. 7912-7921."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00978"},{"key":"e_1_3_2_2_57_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zhang Zicheng","year":"2024","unstructured":"Zicheng Zhang, Bonan Li, Xuecheng Nie, Congying Han, Tiande Guo, and Luoqi Liu. 2024a. Towards consistent video editing with text-to-image diffusion models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_58_1","volume-title":"Leveraging the Power of Data Augmentation for Transformer-based Tracking. In IEEE\/CVF Winter Conference on Applications of Computer Vision. 6469-6478","author":"Zhao Jie","year":"2024","unstructured":"Jie Zhao, Johan Edstedt, Michael Felsberg, Dong Wang, and Huchuan Lu. 2024. Leveraging the Power of Data Augmentation for Transformer-based Tracking. In IEEE\/CVF Winter Conference on Applications of Computer Vision. 6469-6478."},{"key":"e_1_3_2_2_59_1","volume-title":"UK","author":"Zoph Barret","year":"2020","unstructured":"Barret Zoph, Ekin D Cubuk, Golnaz Ghiasi, Tsung-Yi Lin, Jonathon Shlens, and Quoc V Le. 2020. Learning data augmentation strategies for object detection. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XXVII 16. 566-583."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755474","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:23:33Z","timestamp":1765308213000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755474"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":59,"alternative-id":["10.1145\/3746027.3755474","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755474","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}