{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T17:29:31Z","timestamp":1767979771627,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":30,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819543779","type":"print"},{"value":"9789819543786","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,9]],"date-time":"2025-11-09T00:00:00Z","timestamp":1762646400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,9]],"date-time":"2025-11-09T00:00:00Z","timestamp":1762646400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-4378-6_10","type":"book-chapter","created":{"date-parts":[[2025,11,10]],"date-time":"2025-11-10T15:52:54Z","timestamp":1762789974000},"page":"137-152","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["RealCraft: Attention Control as\u00a0A Tool for\u00a0Zero-Shot Consistent Video Editing"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0611-4239","authenticated-orcid":false,"given":"Shutong","family":"Jin","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7672-970X","authenticated-orcid":false,"given":"Ruiyu","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1114-6040","authenticated-orcid":false,"given":"Florian T.","family":"Pokorny","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,9]]},"reference":[{"key":"10_CR1","doi-asserted-by":"crossref","unstructured":"Bar-Tal, O., Ofri-Amar, D., Fridman, R., Kasten, Y., Dekel, T.: Text2LIVE: text-driven layered image and video editing. In: European Conference on Computer Vision, pp. 707\u2013723. Springer (2022)","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"10_CR2","unstructured":"Betker, J., et\u00a0al.: Improving image generation with better captions (2023)"},{"key":"10_CR3","doi-asserted-by":"crossref","unstructured":"Chai, W., Guo, X., Wang, G., Lu, Y.: StableVideo: text-driven consistency-aware diffusion video editing. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 23040\u201323050 (2023)","DOI":"10.1109\/ICCV51070.2023.02106"},{"key":"10_CR4","unstructured":"Couairon, P., Rambour, C., Haugeard, J.E., Thome, N.: VidEdit: Zero-shot and spatially aware text-driven video editing. arXiv preprint arXiv:2306.08707 (2023)"},{"key":"10_CR5","doi-asserted-by":"crossref","unstructured":"Esser, P., Chiu, J., Atighehchian, P., Granskog, J., Germanidis, A.: Structure and content-guided video synthesis with diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7346\u20137356 (2023)","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"10_CR6","unstructured":"Geyer, M., Bar-Tal, O., Bagon, S., Dekel, T.: TokenFlow: Consistent diffusion features for consistent video editing. arXiv preprint arXiv:2307.10373 (2023)"},{"key":"10_CR7","unstructured":"He, Y., Salakhutdinov, R., Kolter, J.Z.: Localized text-to-image generation for free via cross attention control. arXiv preprint arXiv:2306.14636 (2023)"},{"key":"10_CR8","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)"},{"key":"10_CR9","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR10","unstructured":"Jeong, H., Ye, J.C.: Ground-a-video: Zero-shot grounded video editing using text-to-image diffusion models. arXiv preprint arXiv:2310.01107 (2023)"},{"issue":"6","key":"10_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3478513.3480546","volume":"40","author":"Y Kasten","year":"2021","unstructured":"Kasten, Y., Ofri, D., Wang, O., Dekel, T.: Layered neural atlases for consistent video editing. ACM Trans. Graph. (TOG) 40(6), 1\u201312 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"10_CR12","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"10_CR13","doi-asserted-by":"crossref","unstructured":"Lee, Y.C., Jang, J.Z.G., Chen, Y.T., Qiu, E., Huang, J.B.: Shape-aware text-driven layered video editing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14317\u201314326 (2023)","DOI":"10.1109\/CVPR52729.2023.01376"},{"key":"10_CR14","unstructured":"Liao, Z., Deng, Z.: LOVECon: Text-driven training-free long video editing with ControlNet. arXiv preprint arXiv:2310.09711 (2023)"},{"key":"10_CR15","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, Y., Li, W., Lin, Z., Jia, J.: Video-P2P: Video editing with cross-attention control. arXiv preprint arXiv:2303.04761 (2023)","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"10_CR16","doi-asserted-by":"crossref","unstructured":"Lu, T., et al.: Fuse your latents: Video editing with multi-source latent diffusion models. arXiv preprint arXiv:2310.16400 (2023)","DOI":"10.1145\/3664647.3680683"},{"key":"10_CR17","doi-asserted-by":"crossref","unstructured":"Parmar, G., Kumar\u00a0Singh, K., Zhang, R., Li, Y., Lu, J., Zhu, J.Y.: Zero-shot image-to-image translation. In: ACM SIGGRAPH 2023 Conference Proceedings, pp. 1\u201311 (2023)","DOI":"10.1145\/3588432.3591513"},{"key":"10_CR18","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbel\u00e1ez, P., Sorkine-Hornung, A., Van\u00a0Gool, L.: The 2017 Davis challenge on video object segmentation. arXiv preprint arXiv:1704.00675 (2017)"},{"key":"10_CR19","doi-asserted-by":"crossref","unstructured":"Qi, C., et al.: FateZero: Fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535 (2023)","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"10_CR20","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"10_CR21","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"10_CR22","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"10_CR23","doi-asserted-by":"publisher","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"10_CR24","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"issue":"4","key":"10_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450626.3459838","volume":"40","author":"O Tov","year":"2021","unstructured":"Tov, O., Alaluf, Y., Nitzan, Y., Patashnik, O., Cohen-Or, D.: Designing an encoder for StyleGAN image manipulation. ACM Trans. Graph. (TOG) 40(4), 1\u201314 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"10_CR26","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7623\u20137633 (2023)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"10_CR27","unstructured":"Yan, H., Liew, J.H., Mai, L., Lin, S., Feng, J.: Magicprop: Diffusion-based video editing via motion-aware appearance propagation. arXiv preprint arXiv:2309.00908 (2023)"},{"key":"10_CR28","doi-asserted-by":"crossref","unstructured":"Yang, S., Zhou, Y., Liu, Z., Loy, C.C.: Rerender a video: Zero-shot text-guided video-to-video translation. arXiv preprint arXiv:2306.07954 (2023)","DOI":"10.1145\/3610548.3618160"},{"key":"10_CR29","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"10_CR30","unstructured":"Zhao, M., Wang, R., Bao, F., Li, C., Zhu, J.: ControlVideo: Adding conditional control for one shot text-to-video editing. arXiv preprint arXiv:2305.17098 (2023)"}],"container-title":["Lecture Notes in Computer Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-4378-6_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T05:59:00Z","timestamp":1767938340000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-4378-6_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,9]]},"ISBN":["9789819543779","9789819543786"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-4378-6_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,9]]},"assertion":[{"value":"9 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Okinawa","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 November 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"32","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iconip2025.apnns.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}