{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T10:28:26Z","timestamp":1758968906864,"version":"3.40.3"},"publisher-location":"Cham","reference-count":53,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729881"},{"type":"electronic","value":"9783031729898"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72989-8_3","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T17:02:04Z","timestamp":1729875724000},"page":"41-57","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["SAVE: Protagonist Diversification with\u00a0Structure Agnostic Video Editing"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5436-5801","authenticated-orcid":false,"given":"Yeji","family":"Song","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7458-9579","authenticated-orcid":false,"given":"Wonsik","family":"Shin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3467-7855","authenticated-orcid":false,"given":"Junsoo","family":"Lee","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5299-5765","authenticated-orcid":false,"given":"Jeesoo","family":"Kim","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1792-0327","authenticated-orcid":false,"given":"Nojun","family":"Kwak","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Avrahami, O., Aberman, K., Fried, O., Cohen-Or, D., Lischinski, D.: Break-a-scene: extracting multiple concepts from a single image. arXiv preprint arXiv:2305.16311 (2023)","DOI":"10.1145\/3610548.3618154"},{"key":"3_CR2","unstructured":"Bai, J., et al.: Uniedit: a unified tuning-free framework for video motion and appearance editing. arXiv preprint arXiv:2402.13185 (2024)"},{"key":"3_CR3","doi-asserted-by":"publisher","unstructured":"Bar-Tal, O., Ofri-Amar, D., Fridman, R., Kasten, Y., Dekel, T.: Text2live: text-driven layered image and video editing. In: European Conference on Computer Vision, pp. 707\u2013723. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19784-0_41","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22563\u201322575 (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Chen, H., et al.: Videocrafter2: overcoming data limitations for high-quality video diffusion models (2024)","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"3_CR6","doi-asserted-by":"crossref","unstructured":"Esser, P., Chiu, J., Atighehchian, P., Granskog, J., Germanidis, A.: Structure and content-guided video synthesis with diffusion models. arXiv preprint arXiv:2302.03011 (2023)","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"3_CR7","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion (2023)"},{"key":"3_CR8","unstructured":"Geyer, M., Bar-Tal, O., Bagon, S., Dekel, T.: Tokenflow: consistent diffusion features for consistent video editing. arXiv preprint arXiv:2307.10373 (2023)"},{"key":"3_CR9","unstructured":"Guo, Y., et al.: Animatediff: animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)"},{"key":"3_CR10","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control (2023)"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: Clipscore: a reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"3_CR12","unstructured":"Ho, J., et\u00a0al.: Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)"},{"key":"3_CR13","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR14","unstructured":"Hong, W., Ding, M., Zheng, W., Liu, X., Tang, J.: Cogvideo: large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868 (2022)"},{"key":"3_CR15","unstructured":"Huang, Z., Wu, T., Jiang, Y., Chan, K.C., Liu, Z.: Reversion: diffusion-based relation inversion from images. arXiv preprint arXiv:2303.13495 (2023)"},{"key":"3_CR16","unstructured":"Jin, Y., et al.: Video-lavit: unified video-language pre-training with decoupled visual-motional tokenization (2024)"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Kara, O., Kurtkaya, B., Yesiltepe, H., Rehg, J.M., Yanardag, P.: Rave: randomized noise shuffling for fast and consistent video editing with diffusion models (2023)","DOI":"10.1109\/CVPR52733.2024.00622"},{"key":"3_CR18","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"3_CR19","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Zhang, R., Shechtman, E., Zhu, J.Y.: Multi-concept customization of text-to-image diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1931\u20131941 (2023)","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"3_CR20","doi-asserted-by":"crossref","unstructured":"Li, X., Ma, C., Yang, X., Yang, M.H.: Vidtome: video token merging for zero-shot video editing (2023)","DOI":"10.1109\/CVPR52733.2024.00715"},{"key":"3_CR21","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Flowvid: taming imperfect optical flows for consistent video-to-video synthesis (2023)","DOI":"10.1109\/CVPR52733.2024.00784"},{"key":"3_CR22","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, Y., Li, W., Lin, Z., Jia, J.: Video-p2p: video editing with cross-attention control. arXiv preprint arXiv:2303.04761 (2023)","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"3_CR23","unstructured":"Ma, X., et al.: Latte: latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048 (2024)"},{"key":"3_CR24","unstructured":"Ma, Z., et al.: Magic-me: identity-specific video customized diffusion (2024)"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Mokady, R., Hertz, A., Aberman, K., Pritch, Y., Cohen-Or, D.: Null-text inversion for editing real images using guided diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6038\u20136047 (2023)","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"3_CR26","unstructured":"Molad, E., et al.: Dreamix: video diffusion models are general video editors. arXiv preprint arXiv:2302.01329 (2023)"},{"key":"3_CR27","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van\u00a0Gool, L., Gross, M., Sorkine-Hornung, A.: A benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 724\u2013732 (2016)","DOI":"10.1109\/CVPR.2016.85"},{"key":"3_CR29","doi-asserted-by":"crossref","unstructured":"Qi, C., et al.: Fatezero: fusing attentions for zero-shot text-based video editing (2023)","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"3_CR30","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"3_CR31","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"3_CR33","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR34","unstructured":"Singer, U., et\u00a0al.: Make-a-video: text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)"},{"key":"3_CR35","unstructured":"Sohn, K., et\u00a0al.: Styledrop: text-to-image generation in any style. arXiv preprint arXiv:2306.00983 (2023)"},{"key":"3_CR36","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations (2021)"},{"key":"3_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-030-58536-5_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Teed","year":"2020","unstructured":"Teed, Z., Deng, J.: RAFT: recurrent all-pairs field transforms for optical flow. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 402\u2013419. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_24"},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Tewel, Y., Gal, R., Chechik, G., Atzmon, Y.: Key-locked rank one editing for text-to-image personalization. In: ACM SIGGRAPH 2023 Conference Proceedings, pp. 1\u201311 (2023)","DOI":"10.1145\/3588432.3591506"},{"key":"3_CR39","unstructured":"Wang, W., et al.: Zero-shot video editing using off-the-shelf image diffusion models. arXiv preprint arXiv:2303.17599 (2023)"},{"key":"3_CR40","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhang, Y., Ji, Z., Bai, J., Zhang, L., Zuo, W.: Elite: encoding visual concepts into textual embeddings for customized text-to-image generation (2023)","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"3_CR41","doi-asserted-by":"publisher","unstructured":"Wu, C., Liang, J., Ji, L., Yang, F., Fang, Y., Jiang, D., Duan, N.: N\u00fcwa: Visual synthesis pre-training for neural visual world creation. In: European Conference on Computer Vision, pp. 720\u2013736. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19787-1_41","DOI":"10.1007\/978-3-031-19787-1_41"},{"key":"3_CR42","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation (2023)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"3_CR43","unstructured":"Wu, J.Z., et al.: Cvpr 2023 text guided video editing competition (2023)"},{"key":"3_CR44","doi-asserted-by":"crossref","unstructured":"Wu, R., Chen, L., Yang, T., Guo, C., Li, C., Zhang, X.: Lamp: learn a motion pattern for few-shot-based video generation. arXiv preprint arXiv:2310.10769 (2023)","DOI":"10.1109\/CVPR52733.2024.00677"},{"key":"3_CR45","doi-asserted-by":"crossref","unstructured":"Xiao, G., Yin, T., Freeman, W.T., Durand, F., Han, S.: Fastcomposer: tuning-free multi-subject image generation with localized attention. arXiv preprint arXiv:2305.10431 (2023)","DOI":"10.1007\/s11263-024-02227-z"},{"key":"3_CR46","doi-asserted-by":"crossref","unstructured":"Xu, H., Zhang, J., Cai, J., Rezatofighi, H., Tao, D.: Gmflow: learning optical flow via global matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8121\u20138130 (2022)","DOI":"10.1109\/CVPR52688.2022.00795"},{"key":"3_CR47","doi-asserted-by":"crossref","unstructured":"Yang, S., Zhou, Y., Liu, Z., Loy, C.C.: Rerender a video: zero-shot text-guided video-to-video translation. arXiv preprint arXiv:2306.07954 (2023)","DOI":"10.1145\/3610548.3618160"},{"key":"3_CR48","doi-asserted-by":"crossref","unstructured":"Yuan, H., et al.: Instructvideo: instructing video diffusion models with human feedback (2023)","DOI":"10.1109\/CVPR52733.2024.00618"},{"key":"3_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, D.J., et al.: Show-1: marrying pixel and latent diffusion models for text-to-video generation. arXiv preprint arXiv:2309.15818 (2023)","DOI":"10.1007\/s11263-024-02271-9"},{"key":"3_CR50","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W., Tian, Q.: Controlvideo: training-free controllable text-to-video generation. arXiv preprint arXiv:2305.13077 (2023)"},{"key":"3_CR51","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Xing, Z., Zeng, Y., Fang, Y., Chen, K.: Pia: your personalized image animator via plug-and-play modules in text-to-image models (2023)","DOI":"10.1109\/CVPR52733.2024.00740"},{"key":"3_CR52","unstructured":"Zhang, Z., Li, B., Nie, X., Han, C., Guo, T., Liu, L.: Towards consistent video editing with text-to-image diffusion models. Adv. Neural Inf. Process. Syst. (2024)"},{"key":"3_CR53","unstructured":"Zhao, R., et al.: Motiondirector: motion customization of text-to-video diffusion models. arXiv preprint arXiv:2310.08465 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72989-8_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T08:29:42Z","timestamp":1732955382000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72989-8_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031729881","9783031729898"],"references-count":53,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72989-8_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}