{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:47:34Z","timestamp":1778082454566,"version":"3.51.4"},"publisher-location":"Cham","reference-count":62,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726903","type":"print"},{"value":"9783031726910","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72691-0_4","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T18:02:28Z","timestamp":1730570548000},"page":"55-72","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["Lazy Diffusion Transformer for\u00a0Interactive Image Editing"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8851-6279","authenticated-orcid":false,"given":"Yotam","family":"Nitzan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9190-1717","authenticated-orcid":false,"given":"Zongze","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2507-4674","authenticated-orcid":false,"given":"Richard","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6783-1795","authenticated-orcid":false,"given":"Eli","family":"Shechtman","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6777-7445","authenticated-orcid":false,"given":"Daniel","family":"Cohen-Or","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9534-6868","authenticated-orcid":false,"given":"Taesung","family":"Park","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4190-6955","authenticated-orcid":false,"given":"Micha\u00ebl","family":"Gharbi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"4_CR1","doi-asserted-by":"crossref","unstructured":"Avrahami, O., Lischinski, D., Fried, O.: Blended diffusion for text-driven editing of natural images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18208\u201318218 (2022)","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"4_CR2","unstructured":"Betker, J., et\u00a0al.: Improving image generation with better captions. Comput. Sci. 2, 3 (2023). https:\/\/cdn.openai.com\/papers\/dall-e-3.pdf"},{"key":"4_CR3","unstructured":"Brooks, T., et al.: Video generation models as world simulators (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"4_CR4","doi-asserted-by":"crossref","unstructured":"Cao, M., Wang, X., Qi, Z., Shan, Y., Qie, X., Zheng, Y.: MasaCtrl: tuning-free mutual self-attention control for consistent image synthesis and editing. arXiv preprint arXiv:2304.08465 (2023)","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Chang, H., Zhang, H., Jiang, L., Liu, C., Freeman, W.T.: MaskGIT: masked generative image transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11315\u201311325 (2022)","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"4_CR7","unstructured":"Chen, J., et al.: Pixart-$$\\alpha $$: fast training of diffusion transformer for photorealistic text-to-image synthesis (2023)"},{"key":"4_CR8","unstructured":"Chen, M., et al.: Generative pretraining from pixels. In: International Conference on Machine Learning, pp. 1691\u20131703. PMLR (2020)"},{"key":"4_CR9","unstructured":"Couairon, G., Verbeek, J., Schwenk, H., Cord, M.: DiffEdit: diffusion-based semantic image editing with mask guidance. arXiv preprint arXiv:2210.11427 (2022)"},{"key":"4_CR10","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"4_CR11","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: Advances in Neural Information Processing Systems, vol. 34, pp. 8780\u20138794 (2021)"},{"key":"4_CR12","unstructured":"Ding, M., Zheng, W., Hong, W., Tang, J.: CogView2: faster and better text-to-image generation via hierarchical transformers. In: Advances in Neural Information Processing Systems, vol. 35, pp. 16890\u201316902 (2022)"},{"key":"4_CR13","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth $$16 \\times 16$$ words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"4_CR14","unstructured":"Esser, P., et al.: Scaling rectified flow transformers for high-resolution image synthesis (2024)"},{"key":"4_CR15","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"4_CR16","doi-asserted-by":"crossref","unstructured":"Gao, S., Zhou, P., Cheng, M.M., Yan, S.: Masked diffusion transformer is a strong image synthesizer. arXiv preprint arXiv:2303.14389 (2023)","DOI":"10.1109\/ICCV51070.2023.02117"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"4_CR18","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)"},{"key":"4_CR19","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: CLIPScore: a reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"4_CR20","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"4_CR21","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"4_CR22","unstructured":"Karras, T., Aittala, M., Aila, T., Laine, S.: Elucidating the design space of diffusion-based generative models. In: Advances in Neural Information Processing Systems, vol. 35, pp. 26565\u201326577 (2022)"},{"key":"4_CR23","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"4_CR24","unstructured":"Li, M., Lin, J., Meng, C., Ermon, S., Han, S., Zhu, J.Y.: Efficient spatially sparse inference for conditional GANs and diffusion models. In: Advances in Neural Information Processing Systems, vol. 35, pp. 28858\u201328873 (2022)"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Liu, W., Cun, X., Pun, C.M., Xia, M., Zhang, Y., Wang, J.: CoordFill: efficient high-resolution image inpainting via parameterized coordinate querying. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 1746\u20131754 (2023)","DOI":"10.1609\/aaai.v37i2.25263"},{"key":"4_CR26","unstructured":"Liu, X., Zhang, X., Ma, J., Peng, J., Liu, Q.: InstaFlow: one step is enough for high-quality diffusion-based text-to-image generation. arXiv preprint arXiv:2309.06380 (2023)"},{"key":"4_CR27","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"4_CR28","unstructured":"Lu, C., Zhou, Y., Bao, F., Chen, J., Li, C., Zhu, J.: DPM-solver: a fast ODE solver for diffusion probabilistic model sampling in around 10 steps. In: Advances in Neural Information Processing Systems, vol. 35, pp. 5775\u20135787 (2022)"},{"key":"4_CR29","unstructured":"Lu, C., Zhou, Y., Bao, F., Chen, J., Li, C., Zhu, J.: DPM-solver++: fast solver for guided sampling of diffusion probabilistic models. arXiv preprint arXiv:2211.01095 (2022)"},{"key":"4_CR30","unstructured":"Luo, S., Tan, Y., Huang, L., Li, J., Zhao, H.: Latent consistency models: synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378 (2023)"},{"key":"4_CR31","unstructured":"Meng, C., et al.: SDEdit: guided image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073 (2021)"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Nguyen, T.H., Tran, A.: SwiftBrush: one-step text-to-image diffusion model with variational score distillation. arXiv preprint arXiv:2312.05239 (2023)","DOI":"10.1109\/CVPR52733.2024.00746"},{"key":"4_CR33","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"4_CR34","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning, pp. 8162\u20138171. PMLR (2021)"},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Patashnik, O., Garibi, D., Azuri, I., Averbuch-Elor, H., Cohen-Or, D.: Localizing object-level shape variations with text-to-image diffusion models. arXiv preprint arXiv:2303.11306 (2023)","DOI":"10.1109\/ICCV51070.2023.02107"},{"key":"4_CR36","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"4_CR37","doi-asserted-by":"crossref","unstructured":"P\u00e9rez, P., Gangnet, M., Blake, A.: Poisson image editing. In: ACM SIGGRAPH 2003 Papers, pp. 313\u2013318 (2003)","DOI":"10.1145\/1201775.882269"},{"key":"4_CR38","unstructured":"von Platen, P., et al.: Diffusers: state-of-the-art diffusion models (2022). https:\/\/github.com\/huggingface\/diffusers"},{"key":"4_CR39","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"4_CR40","doi-asserted-by":"crossref","unstructured":"Qi, L., et al.: Open world entity segmentation. IEEE Trans. Pattern Anal. Mach. Intell. (2022)","DOI":"10.1109\/TPAMI.2022.3227513"},{"issue":"1","key":"4_CR41","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"4_CR42","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, 1(2), 3 (2022)"},{"key":"4_CR43","unstructured":"Razavi, A., Van\u00a0den Oord, A., Vinyals, O.: Generating diverse high-fidelity images with VQ-VAE-2. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"4_CR44","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"4_CR45","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Advances in Neural Information Processing Systems, vol. 35, pp. 36479\u201336494 (2022)"},{"key":"4_CR46","unstructured":"Salimans, T., Ho, J.: Progressive distillation for fast sampling of diffusion models. arXiv preprint arXiv:2202.00512 (2022)"},{"key":"4_CR47","unstructured":"Schuhmann, C., et al.: LAION-400m: open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"4_CR48","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"4_CR49","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"4_CR50","unstructured":"Song, Y., Dhariwal, P., Chen, M., Sutskever, I.: Consistency models (2023)"},{"key":"4_CR51","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)"},{"key":"4_CR52","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-Or, D., Bermano, A.H.: Human motion diffusion model. arXiv preprint arXiv:2209.14916 (2022)"},{"key":"4_CR53","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"4_CR54","doi-asserted-by":"crossref","unstructured":"Wang, S., et\u00a0al.: Imagen editor and editbench: advancing and evaluating text-guided image inpainting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18359\u201318369 (2023)","DOI":"10.1109\/CVPR52729.2023.01761"},{"key":"4_CR55","unstructured":"Stable-diffusion webui: stable-diffusion-webui (2024). https:\/\/github.com\/AUTOMATIC1111\/stable-diffusion-webui. Accessed Jan 2024"},{"key":"4_CR56","unstructured":"Wei, C., et al.: Diffusion models as masked autoencoders. arXiv preprint arXiv:2304.03283 (2023)"},{"key":"4_CR57","doi-asserted-by":"crossref","unstructured":"Xie, S., Zhang, Z., Lin, Z., Hinz, T., Zhang, K.: SmartBrush: text and shape guided object inpainting with diffusion model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22428\u201322437 (2023)","DOI":"10.1109\/CVPR52729.2023.02148"},{"key":"4_CR58","unstructured":"Yin, T., et al.: One-step diffusion with distribution matching distillation. arXiv preprint arXiv:2311.18828 (2023)"},{"key":"4_CR59","doi-asserted-by":"crossref","unstructured":"Yu, J., Lin, Z., Yang, J., Shen, X., Lu, X., Huang, T.S.: Free-form image inpainting with gated convolution. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4471\u20134480 (2019)","DOI":"10.1109\/ICCV.2019.00457"},{"key":"4_CR60","unstructured":"Zhao, S., et al.: Large scale image completion via co-modulated generative adversarial networks. In: International Conference on Learning Representations (ICLR) (2021)"},{"key":"4_CR61","unstructured":"Zheng, H., Nie, W., Vahdat, A., Anandkumar, A.: Fast training of diffusion models with masked transformers. arXiv preprint arXiv:2306.09305 (2023)"},{"key":"4_CR62","unstructured":"Zhu, Z., et al.: Designing a better asymmetric VQGAN for stablediffusion. arXiv preprint arXiv:2306.04632 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72691-0_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T18:03:28Z","timestamp":1730570608000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72691-0_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031726903","9783031726910"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72691-0_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}