{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:46:53Z","timestamp":1778082413035,"version":"3.51.4"},"publisher-location":"Cham","reference-count":61,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031734038","type":"print"},{"value":"9783031734045","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73404-5_21","type":"book-chapter","created":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T16:03:13Z","timestamp":1730217793000},"page":"358-376","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["DreamMotion: Space-Time Self-similar Score Distillation for\u00a0Zero-Shot Video Editing"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6864-4190","authenticated-orcid":false,"given":"Hyeonho","family":"Jeong","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7426-8304","authenticated-orcid":false,"given":"Jinho","family":"Chang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7522-4553","authenticated-orcid":false,"given":"Geon Yeong","family":"Park","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9763-9609","authenticated-orcid":false,"given":"Jong Chul","family":"Ye","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,30]]},"reference":[{"key":"21_CR1","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: IEEE International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"21_CR2","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22563\u201322575 (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"21_CR4","doi-asserted-by":"crossref","unstructured":"Ceylan, D., Huang, C.H.P., Mitra, N.J.: Pix2Video: video editing using image diffusion. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 23206\u201323217 (2023)","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"21_CR5","unstructured":"Chen, W., et al.: Control-A-Video: controllable text-to-video generation with diffusion models. arXiv preprint arXiv:2305.13840 (2023)"},{"key":"21_CR6","unstructured":"Cong, Y., et al.: Flatten: optical flow-guided attention for consistent text-to-video editing. arXiv preprint arXiv:2310.05922 (2023)"},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Esser, P., Chiu, J., Atighehchian, P., Granskog, J., Germanidis, A.: Structure and content-guided video synthesis with diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7346\u20137356 (2023)","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"21_CR8","unstructured":"Geyer, M., Bar-Tal, O., Bagon, S., Dekel, T.: TokenFlow: consistent diffusion features for consistent video editing. arXiv preprint arXiv:2307.10373 (2023)"},{"key":"21_CR9","doi-asserted-by":"crossref","unstructured":"Hertz, A., Aberman, K., Cohen-Or, D.: Delta denoising score. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2328\u20132337 (2023)","DOI":"10.1109\/ICCV51070.2023.00221"},{"key":"21_CR10","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: ClipScore: a reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"21_CR11","unstructured":"Ho, J., et\u00a0al.: Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)"},{"key":"21_CR12","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"21_CR13","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"21_CR14","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. arXiv:2204.03458 (2022)"},{"key":"21_CR15","unstructured":"Hu, Z., Xu, D.: VideoControlNet: a motion-guided video-to-video translation framework by using diffusion model with ControlNet. arXiv preprint arXiv:2307.14073 (2023)"},{"issue":"4","key":"21_CR16","first-page":"695","volume":"6","author":"A Hyv\u00e4rinen","year":"2005","unstructured":"Hyv\u00e4rinen, A., Dayan, P.: Estimation of non-normalized statistical models by score matching. J. Mach. Learn. Res. 6(4), 695\u2013709 (2005)","journal-title":"J. Mach. Learn. Res."},{"key":"21_CR17","doi-asserted-by":"crossref","unstructured":"Jeong, H., Park, G.Y., Ye, J.C.: VMC: video motion customization using temporal attention adaption for text-to-video diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9212\u20139221 (2024)","DOI":"10.1109\/CVPR52733.2024.00880"},{"key":"21_CR18","unstructured":"Jeong, H., Ye, J.C.: Ground-A-Video: zero-shot grounded video editing using text-to-image diffusion models. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"21_CR19","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., et al.: Text2Video-zero: text-to-image diffusion models are zero-shot video generators. arXiv preprint arXiv:2303.13439 (2023)","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"21_CR20","unstructured":"Kim, S., Lee, K., Choi, J.S., Jeong, J., Sohn, K., Shin, J.: Collaborative score distillation for consistent visual synthesis. arXiv preprint arXiv:2307.04787 (2023)"},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"Kolkin, N., Salavon, J., Shakhnarovich, G.: Style transfer by relaxed optimal transport and self-similarity. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10051\u201310060 (2019)","DOI":"10.1109\/CVPR.2019.01029"},{"key":"21_CR22","unstructured":"Kwon, G., Ye, J.C.: Diffusion-based image translation using disentangled style and content representation. arXiv preprint arXiv:2209.15264 (2022)"},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Li, L.H., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"21_CR24","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: GLIGEN: open-set grounded text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22511\u201322521 (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"21_CR25","doi-asserted-by":"crossref","unstructured":"Lin, C.H., et al.: Magic3D: high-resolution text-to-3D content creation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 300\u2013309 (2023)","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"21_CR26","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, Y., Li, W., Lin, Z., Jia, J.: Video-P2P: video editing with cross-attention control. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8599\u20138608 (2024)","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"21_CR27","doi-asserted-by":"crossref","unstructured":"Liu, S., et\u00a0al.: Grounding DINO: marrying DINO with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"21_CR28","doi-asserted-by":"crossref","unstructured":"Nam, H., Kwon, G., Park, G.Y., Ye, J.C.: Contrastive denoising score for text-guided latent diffusion image editing. arXiv preprint arXiv:2311.18608 (2023)","DOI":"10.1109\/CVPR52733.2024.00878"},{"key":"21_CR29","doi-asserted-by":"crossref","unstructured":"Park, G.Y., Jeong, H., Lee, S.W., Ye, J.C.: Spectral motion alignment for video motion transfer using diffusion models. arXiv preprint arXiv:2403.15249 (2024)","DOI":"10.1609\/aaai.v39i6.32685"},{"key":"21_CR30","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"21_CR31","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbel\u00e1ez, P., Sorkine-Hornung, A., Van\u00a0Gool, L.: The 2017 Davis challenge on video object segmentation. arXiv preprint arXiv:1704.00675 (2017)"},{"key":"21_CR32","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: DreamFusion: text-to-3D using 2D diffusion. arXiv preprint arXiv:2209.14988 (2022)"},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Qi, C., et al.: FateZero: fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535 (2023)","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"21_CR34","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"21_CR35","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, 1(2), 3 (2022)"},{"key":"21_CR36","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"21_CR37","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"21_CR38","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"21_CR39","doi-asserted-by":"crossref","unstructured":"Shechtman, E., Irani, M.: Matching local self-similarities across images and videos. In: 2007 IEEE Conference on Computer Vision and Pattern Recognition, pp.\u00a01\u20138. IEEE (2007)","DOI":"10.1109\/CVPR.2007.383198"},{"key":"21_CR40","unstructured":"Singer, U., et\u00a0al.: Make-A-Video: text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)"},{"key":"21_CR41","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"21_CR42","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"21_CR43","unstructured":"Song, Y., Ermon, S.: Generative modeling by estimating gradients of the data distribution. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"21_CR44","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)"},{"key":"21_CR45","unstructured":"Sterling, S.: Zeroscope (2023). https:\/\/huggingface.co\/cerspense\/zeroscope_v2_576w"},{"key":"21_CR46","unstructured":"Tang, L., Jia, M., Wang, Q., Phoo, C.P., Hariharan, B.: Emergent correspondence from image diffusion. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"21_CR47","doi-asserted-by":"crossref","unstructured":"Tumanyan, N., Bar-Tal, O., Bagon, S., Dekel, T.: Splicing ViT features for semantic appearance transfer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10748\u201310757 (2022)","DOI":"10.1109\/CVPR52688.2022.01048"},{"issue":"7","key":"21_CR48","doi-asserted-by":"publisher","first-page":"1661","DOI":"10.1162\/NECO_a_00142","volume":"23","author":"P Vincent","year":"2011","unstructured":"Vincent, P.: A connection between score matching and denoising autoencoders. Neural Comput. 23(7), 1661\u20131674 (2011)","journal-title":"Neural Comput."},{"key":"21_CR49","unstructured":"Wang, J., Yuan, H., Chen, D., Zhang, Y., Wang, X., Zhang, S.: ModelScope text-to-video technical report. arXiv preprint arXiv:2308.06571 (2023)"},{"key":"21_CR50","unstructured":"Wang, W., et al.: Zero-shot video editing using off-the-shelf image diffusion models. arXiv preprint arXiv:2303.17599 (2023)"},{"key":"21_CR51","unstructured":"Wang, Y., et\u00a0al.: LAViE: high-quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103 (2023)"},{"key":"21_CR52","unstructured":"Wang, Z., et al.: ProlificDreamer: high-fidelity and diverse text-to-3D generation with variational score distillation. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"21_CR53","doi-asserted-by":"crossref","unstructured":"Wei, Y., et al.: DreamVideo: composing your dream videos with customized subject and motion. arXiv preprint arXiv:2312.04433 (2023)","DOI":"10.1109\/CVPR52733.2024.00625"},{"key":"21_CR54","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7623\u20137633 (2023)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"21_CR55","doi-asserted-by":"crossref","unstructured":"Wu, R., Chen, L., Yang, T., Guo, C., Li, C., Zhang, X.: LAMP: learn a motion pattern for few-shot-based video generation. arXiv preprint arXiv:2310.10769 (2023)","DOI":"10.1109\/CVPR52733.2024.00677"},{"key":"21_CR56","doi-asserted-by":"crossref","unstructured":"Yatim, D., Fridman, R., Tal, O.B., Kasten, Y., Dekel, T.: Space-time diffusion features for zero-shot text-driven motion transfer. arXiv preprint arXiv:2311.17009 (2023)","DOI":"10.1109\/CVPR52733.2024.00809"},{"key":"21_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, D.J., et al.: Show-1: marrying pixel and latent diffusion models for text-to-video generation. arXiv preprint arXiv:2309.15818 (2023)","DOI":"10.1007\/s11263-024-02271-9"},{"key":"21_CR58","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"21_CR59","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W., Tian, Q.: ControlVideo: training-free controllable text-to-video generation. arXiv preprint arXiv:2305.13077 (2023)"},{"key":"21_CR60","unstructured":"Zhang, Y., et al.: MotionCrafter: one-shot motion customization of diffusion models. arXiv preprint arXiv:2312.05288 (2023)"},{"key":"21_CR61","unstructured":"Zhao, R., et al.: MotionDirector: motion customization of text-to-video diffusion models. arXiv preprint arXiv:2310.08465 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73404-5_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T19:44:56Z","timestamp":1745523896000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73404-5_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,30]]},"ISBN":["9783031734038","9783031734045"],"references-count":61,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73404-5_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,30]]},"assertion":[{"value":"30 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"Our work is based on generative models that carry the risk of being repurposed for unethical uses, such as misleading content.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics Statement"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}